COD_20230409_HOST_1_QC_analysis

Minsik Kim

2023-04-09

Loading packages

#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
#   knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
# Silence package messages and warnings in every knitted chunk.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# Project root (used as the knitr root directory by FUN.LineZero.Boot()) and
# the R library location. Each value is assigned exactly once.
path_working <- "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git"
path_library <- "/Library/Frameworks/R.framework/Resources/library"

# Packages loaded by FUN.LineZero.Boot() and listed by FUN.LineZero.Report().
# Every package is listed once so the environment report has no repeated rows.
str_libraries <- c(
    "readxl", "phyloseq", "tidyverse", "pacman", "yaml", "ggplot2", "vegan",
    "microbiome", "ggpubr", "viridis", "decontam", "gridExtra", "lme4",
    "lmerTest", "writexl", "harrietr", "Maaslin2", "ggtext", "ggpmisc",
    "gamm4", "reshape2", "kableExtra", "knitr", "ggtree", "car"
)

# YAML front matter; printed verbatim by FUN.LineZero.Report().
# NOTE(review): the date below reads 2032.04.09 while the document itself is
# dated 2023-04-09 -- confirm whether the year is a typo.
YAML_header <-
'---
title: "Host-DNA depletion 1: data wrangling"
author: "Minsik Kim"
date: "2032.04.09"
output:
    rmdformats::downcute:
        downcute_theme: "chaos"
        code_folding: hide
        fig_width: 6
        fig_height: 6
---'

# Random seed handed to set.seed() in FUN.LineZero.Boot(); stored as a string.
seed <- "20230330"

#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Boots the document environment.
#
# Reads the globals `path_library`, `path_working`, `str_libraries` and `seed`.
# Side effects: sets the library search path, attaches pacman and every
# requested package, sets the knitr root directory, and seeds the RNG.
# Returns NULL invisibly (the value of set.seed()).
FUN.LineZero.Boot <- function() {
    .libPaths(path_library)

    # library() stops with an error when pacman is missing; require() would
    # only return FALSE and let the failure surface later.
    library(pacman)

    # p_load() captures its `...` unevaluated, so a character vector must be
    # passed through `char =`; otherwise the whole c(...) call is deparsed and
    # treated as a single (non-existent) package name.
    pacman::p_load(char = c("knitr", "rmarkdown", "rmdformats", "yaml"))

    knitr::opts_knit$set(root.dir = path_working)

    # Load each requested package once, in alphabetical order. The global
    # vector itself is left untouched.
    pacman::p_load(char = sort(unique(str_libraries)))

    # `seed` is stored as a string; convert explicitly instead of relying on
    # the implicit coercion inside set.seed().
    set.seed(as.integer(seed))
}
FUN.LineZero.Boot()
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Prints a plain-text report of the document environment via cat():
# R version, the version of every package in `str_libraries`, the operating
# system, the library and working paths, the seed, and the YAML header.
#
# Reads the globals `str_libraries`, `path_library`, `path_working`, `seed`
# and `YAML_header`. Returns NULL invisibly (the value of the final cat()).
FUN.LineZero.Report <- function() {
    cat("Line Zero Environment:\n\n")
    cat(paste("R:", pacman::p_version(), "\n"))
    cat("Libraries:\n")
    # Use a dedicated loop variable (the vector name is no longer shadowed)
    # and report each package once even if it was requested more than once.
    for (pkg in unique(str_libraries)) {
        cat(paste0("    ", pkg, ": ", pacman::p_version(package = pkg), "\n"))
    }
    cat(paste("\nOperating System:", pacman::p_detectOS(), "\n"))
    cat(paste("    Library Path:", path_library, "\n"))
    cat(paste("    Working Path:", path_working, "\n"))
    cat(paste("Seed:", seed, "\n\n"))
    cat("YAML Header:\n")
    cat(YAML_header)
}
FUN.LineZero.Report()
## Line Zero Environment:
## 
## R: 4.2.2 
## Libraries:
##     readxl: 1.4.2
##     phyloseq: 1.40.0
##     tidyverse: 2.0.0
##     pacman: 0.5.1
##     yaml: 2.3.7
##     ggplot2: 3.4.1
##     vegan: 2.6.4
##     microbiome: 1.18.0
##     ggpubr: 0.6.0
##     viridis: 0.6.2
##     decontam: 1.16.0
##     gridExtra: 2.3
##     ggpubr: 0.6.0
##     lme4: 1.1.31
##     lmerTest: 3.1.3
##     writexl: 1.4.2
##     harrietr: 0.2.3
##     Maaslin2: 1.10.0
##     ggtext: 0.1.2
##     ggpmisc: 0.5.2
##     gridExtra: 2.3
##     gamm4: 0.2.6
##     reshape2: 1.4.4
##     kableExtra: 1.3.4
##     knitr: 1.42
##     ggtree: 3.4.4
##     car: 3.1.1
## 
## Operating System: Darwin 
##     Library Path: /Library/Frameworks/R.framework/Resources/library 
##     Working Path: /Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/5_Scripts/MGK/Host_depletion_git 
## Seed: 20230330 
## 
## YAML Header:
## ---
## title: "Host-DNA depletion 1: data wrangling"
## author: "Minsik Kim"
## date: "2032.04.09"
## output:
##     rmdformats::downcute:
##         downcute_theme: "chaos"
##         code_folding: hide
##         fig_width: 6
##         fig_height: 6
## ---

Script description

1. Loading data

1.1. phyloseq object

1.2. qPCR data (controls)

2. QC

QC1. How many samples failed sequencing

QC2. How were changes in read stats and host DNA proportion?

QC3. How were the extraction controls

QC4. Prevalence / abundance filtering - red flag

3. Analysis

A0. Calculation of alpha-diversity indices

A1. Host DNA, bacterial DNA and % host

A2. Modeling of sequencing results

A3. Taxa alpha diversity

A4. Taxa beta diversity

Intermediate results

A5. DA analysis for taxa

A6. Decontam

A7. LM of function alpha diversity (BPI)

A8. permanova of function alpha diversity

A9. DA for function

Data inputs

Meta data

  • qPCR - bacteria

  • qPCR - human

  • qPCR host %

  • Raw reads

  • final reads

  • sequencing host %

  • library prep failure status

  • Raw reads

  • subject_id

  • treatment

  • sample_type

  • subject_id

Sequencing result

  • samples

  • controls

Loading data

# Loading files -----------------------------------------------------------
# Read the tidy phyloseq bundle from disk. Later chunks access its
# `phyloseq_count` and `phyloseq_rel` elements.
phyloseq <- "/Users/minsikkim/Dropbox (Partners HealthCare)/Project_SICAS2_microbiome/4_Data/2_Tidy/Phyloseq/PHY_20221129_MGK_host_tidy_tax.rds" %>%
        read_rds()


# Sample metadata of the count object.
# NOTE(review): this variable shares its name with phyloseq's sample_data()
# function; calls such as sample_data(x) still resolve to the function.
sample_data <- phyloseq$phyloseq_count %>%
        sample_data()

Q1. How were sequencing results?

Figure - regular scale

Raw scale is not normally distributed

# Initial QC --------------------------------------------------------------
# Questions - QC

# Q0. How many samples failed in sequencing?

## figures ----- raw data
# Histograms of four read statistics on the raw (untransformed) scale,
# one panel per sample type x statistic, filled by treatment.
# Rows without a subject_id are dropped first.
sample_data %>%
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        gather(feature, value, Raw_reads:sequencing_host_prop) %>%
        # Keep only the four statistics that are plotted. No grouping is
        # needed because nothing below is computed per group.
        dplyr::filter(feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")) %>%
        mutate(feature = factor(feature,
                                levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"),
                                labels = c("Raw reads", "Host mapped", "Final reads", "Host ratio"))) %>%
        ggplot(aes(x = value, fill = treatment)) +
                geom_histogram(bins = 97) +
                guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
                facet_grid(sample_type ~ feature, scales = "free") +
                # The x axis is the untransformed value, so the title must not
                # claim a log10 transform.
                ggtitle("Raw-scale histogram") +
                theme_classic() +
                theme(legend.position = "top")

Figure - log10 scale

Log transformation is adequate for read counts

Host % is not transformed well

## figures ----- log10
# Same four read statistics as above, drawn as histograms of log10(value).
# The host proportion is first converted to a percentage.
sample_data %>%
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        mutate(host_seq_percent = sequencing_host_prop * 100,
               .after = sequencing_host_prop) %>%
        gather(key = "feature", value = "value", Raw_reads:host_seq_percent) %>%
        group_by(feature, sample_type) %>%
        dplyr::filter(feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent")) %>%
        mutate(feature = factor(feature,
                                levels = c("Raw_reads", "Host_mapped", "Final_reads", "host_seq_percent"),
                                labels = c("Raw reads", "Host mapped", "Final reads", "Host %"))) %>%
        ggplot(mapping = aes(x = log10(value), fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_grid(sample_type ~ feature, scales = "free") +
                guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
                ggtitle("log10 transformed") +
                theme_classic() +
                theme(legend.position = "top")

Figure - scaling host proportion

Raw % will be used for host%

## figures ----- host proportion: raw vs. log10 vs. square root
# Compares three scalings of the sequencing host proportion side by side,
# one panel per sample type x scaling, filled by treatment.
sample_data %>%
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        mutate(host_seq_percent = sequencing_host_prop,
               log_seq_percent = log10(host_seq_percent),
               sqrt_seq_percent = sqrt(host_seq_percent),
               .after = sequencing_host_prop) %>%
        gather(key = "feature", value = "value", Raw_reads:sqrt_seq_percent) %>%
        group_by(feature, sample_type) %>%
        dplyr::filter(feature %in% c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent")) %>%
        mutate(feature = factor(feature,
                                levels = c("host_seq_percent", "log_seq_percent", "sqrt_seq_percent"),
                                labels = c("Host ratio", "log10 (host ratio)", "Sqrt(host ratio)"))) %>%
        ggplot(mapping = aes(x = value, fill = treatment)) +
                geom_histogram(bins = 97) +
                facet_grid(sample_type ~ feature, scales = "free") +
                guides(fill = guide_legend(title = "Treatment", nrow = 1)) +
                ggtitle("Host % transfromed (raw, log10, and sqrt) histrogram") +
                theme_classic() +
                theme(legend.position = "top")

Figure - log10 scale by treatment

# Two stacked histograms of final reads (raw scale on top, log10 below),
# faceted by sample type and sharing one treatment legend.
sample_data %>%
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        {
                # Six-colour palette, one colour per treatment level.
                treatment_palette <- c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")
                ggarrange(
                        ggplot(., aes(x = Final_reads, fill = treatment)) +
                                geom_histogram(bins = 97) +
                                facet_wrap(~sample_type) +
                                theme_classic(base_family = "serif") +
                                ggtitle("Histogram of final reads by sample type and treatment") +
                                scale_fill_manual(values = treatment_palette, name = "Treatment"),
                        ggplot(., aes(x = log10(Final_reads), fill = treatment)) +
                                geom_histogram(bins = 97) +
                                facet_wrap(~sample_type) +
                                theme_classic(base_family = "serif") +
                                ggtitle("Histogram of log10(final reads) by sample type and treatment") +
                                scale_fill_manual(values = treatment_palette, name = "Treatment"),
                        common.legend = TRUE,
                        ncol = 1
                )
        }

Histogram (sum of OTU table)

2 samples showed 0 reads in sum(OTU)

# Histogram (100 breaks) of log10(column sums of the OTU count table + 1).
# The +1 keeps columns whose sum is 0 on the plot (log10(1) = 0).
# NOTE(review): colSums() gives per-sample totals only if taxa are rows in
# this otu_table -- confirm.
hist((log10((phyloseq$phyloseq_count %>% otu_table %>% colSums()) + 1)),100, main = "Histogram of total reads (sum of OTU table)") # 2 samples showed 0 total reads (sum of otu_table)

Final reads by sample type

Some samples did not pass library prep QC, but showed reasonable final reads

# How do total OTU reads per sample differ between sample types?
# Points show log10(column sum of the OTU table + 1) per sample, ordered from
# the largest to the smallest total and coloured by sample type.
sample_data %>%
        data.frame() %>%
        mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
        ggplot(aes(x = reorder(baylor_other_id, -total_read),
                   y = log10(total_read + 1),
                   col = sample_type)) +
                geom_point() +
                theme_classic(base_family = "serif") +
                theme(axis.title.y = element_markdown(),
                      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)) +
                ylab("log<sub>10</sub>(Sum of OTU table reads)") +
                xlab("Sample ID") +
                # Colour encodes sample_type here, so the legend is titled
                # accordingly (it was previously titled "Library failed").
                guides(col = guide_legend(title = "Sample type")) +
                ggtitle("Sum of OTU reads by sample type")

Final reads of library prep failed samples

Some samples did not pass library prep QC, but showed reasonable final reads

# How did the samples that failed library prep perform in sequencing?
# Points show log10(column sum of the OTU table + 1) per sample, ordered from
# the largest to the smallest total and coloured by library-failure status.
sample_data %>%
        data.frame() %>%
        mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
        ggplot(mapping = aes(x = reorder(baylor_other_id, -total_read),
                             y = log10(total_read + 1),
                             colour = lib_failed)) +
                geom_point() +
                theme_classic(base_family = "serif") +
                theme(axis.title.y = element_markdown(),
                      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)) +
                labs(x = "Sample ID",
                     y = "log<sub>10</sub>(Sum of OTU table reads)",
                     title = "Sum of OTU reads by library failure status") +
                guides(colour = guide_legend(title = "Library failed"))

Raw reads, Mapped reads, host reads, final reads, and sumOTU

Some samples did not pass library prep QC, but showed reasonable final reads

# How do read counts change across the data-processing steps?
# The per-sample read columns (plus the OTU-table column sum) are melted to
# long format and plotted as log10(reads + 1), coloured by processing step.
sample_data %>%
        data.frame() %>%
        mutate(total_read = colSums(otu_table(phyloseq$phyloseq_count))) %>%
        melt(id.vars = c("baylor_other_id"),
             measure.vars = c("Raw_reads", "LowQual_removed", "Reads_after_trim", "Host_mapped", "Final_reads", "Metaphlan_mapped", "total_read"),
             variable.name = "category",
             value.name = "reads") %>%
        mutate(category = factor(category,
                                 levels = c("Raw_reads", "LowQual_removed", "Reads_after_trim", "Host_mapped", "Final_reads", "Metaphlan_mapped", "total_read"),
                                 labels = c("Raw", "Low qual removed", "Trimmed reads", "Host", "Final", "Metaphlan", "OTU sum"))) %>%
        ggplot(aes(x = reorder(baylor_other_id, -reads),
                   y = log10(reads + 1),
                   col = category)) +
                geom_point() +
                theme_classic(base_family = "serif") +
                theme(axis.title.y = element_markdown(),
                      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 4)) +
                ylab("log<sub>10</sub>(reads + 1)") +
                xlab("Sample ID") +
                # Colour encodes the read category, so the legend is titled
                # accordingly (it was previously titled "Library failed").
                guides(col = guide_legend(title = "Processing step")) +
                ggtitle("Read counts by samples at each data processing step")

List of samples failed in sequencing

2 BAL samples (control and lyPMA group) failed in sequencing

# List the metadata rows whose OTU-table column sum is exactly 0
# (two BAL samples showed 0 total reads).
sample_data %>%
        data.frame() %>%
        filter(colSums(otu_table(phyloseq$phyloseq_count)) == 0)
# Alternative view, kept for reference: samples flagged as failed library prep.
#sample_data(phyloseq$phyloseq_count) %>% data.frame() %>% subset(., .$lib_failed)

QC 1 Results:

1.1 Modeling of final reads should be conducted with a log transform. Host % needs no transformation.

1.2 13 samples failed in library prep

1.3. Two BAL samples showed 0 total reads

1.4. Sequencing fail ≠ library prep fail

Comments from Baylor:

Q: What was the lab’s criteria for deciding which samples failed library prep.? There were 13 samples that you pointed as failed but their sequencing result actually looks pretty good (ie similar to samples that didn’t fail library prep)

A: To determine whether a library attempt “passed or failed” the lab looks at the picogreen concentrations and a library fragment size distribution trace. The trace files are an output from either the Fragment Analyzer or TapeStation (a copy of the trace files for PQ00331 is attached). If a sample has a background level pico concentration and no discernable fragment concentration on the trace (i.e. a flat trace line) it is considered failed library. If the sample is below the level of detection of our standard library QC methods, it is considered failure. It’s still possible that there is some small amounts of library in those samples that were successfully sequenced, but often those samples do not generate a meaningful amount of sequence data.

QC2 Changes of reads and host % by treatment

For detailed analysis, sequencing matrices were analyzed by each sample type and treatment

Reads and host % by treatment

QC table by treated (binary)

Changes in matrices were observed

#sequencing result by sample type and control (1/0)
# Silence dplyr's summarise() grouping message (this option persists for the
# rest of the session).
options(dplyr.summarise.inform = FALSE)

# Formats a numeric vector as "median [Q1, Q3]".
#   x      numeric vector to summarise
#   digits number of decimals to round to and to print
# Each statistic is formatted on its own so that values are not padded to a
# common width; thousands are separated with ",".
format_median_iqr <- function(x, digits = 2) {
        stats <- c(median(x), quantile(x, probs = c(0.25, 0.75), names = FALSE))
        stats <- vapply(round(stats, digits), format, character(1),
                        nsmall = digits, big.mark = ",")
        paste0(stats[1], " [", stats[2], ", ", stats[3], "]")
}

# One row per sample type x treated flag: sample count plus median [IQR] of
# raw reads, host reads, host-read percentage and final reads. Read counts are
# reported in units of 10^7 reads. Rendered as an HTML table.
sample_data %>%
        data.frame() %>%
        group_by(sample_type, treated) %>%
        summarise(
                N = n(),
                `Raw reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Raw_reads / 1e7),
                `Host reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Host_mapped / 1e7),
                `Host reads proportion<br>(median [IQR])<br>[%]` = format_median_iqr(sequencing_host_prop * 100),
                `Final reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Final_reads / 1e7)
        ) %>%
        rename(`Sample type` = sample_type, Treated = treated) %>%
        # data.frame() also drops the grouping left over from summarise().
        data.frame(check.names = FALSE) %>%
        mutate(across(everything(), linebreak)) %>%
        kbl(format = "html", escape = FALSE) %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Sample type Treated N Raw reads
(median [IQR])
[reads x 107]
Host reads
(median [IQR])
[reads x 107]
Host reads proportion
(median [IQR])
[%]
Final reads
(median [IQR])
[reads x 107]
Neg. 0 6 0.20 [0.17, 0.22] 0.02 [0.01, 0.02] 16.85 [7.97, 20.09] 0.08 [0.06, 0.11]
Neg. 1 25 0.22 [0.17, 0.30] 0.02 [0.02, 0.03] 16.70 [14.76, 20.97] 0.10 [0.08, 0.14]
Mock 0 6 10.88 [10.36, 11.02] 0.03 [0.02, 0.03] 0.30 [0.28, 0.31] 10.02 [9.60, 10.21]
Mock 1 25 10.58 [8.16, 11.83] 0.07 [0.06, 0.07] 0.64 [0.63, 0.66] 9.79 [7.29, 10.87]
BAL 0 5 15.73 [6.35, 15.92] 12.92 [5.21, 12.94] 99.72 [99.59, 99.75] 0.03 [0.03, 0.04]
BAL 1 25 6.17 [4.57, 17.43] 4.65 [2.78, 12.80] 95.83 [87.19, 98.81] 0.17 [0.10, 0.37]
Nasal 0 10 13.09 [7.73, 16.93] 10.05 [6.11, 13.04] 94.05 [92.82, 97.87] 0.48 [0.10, 0.87]
Nasal 1 25 4.08 [0.99, 6.40] 0.81 [0.26, 1.36] 32.80 [15.74, 78.71] 0.97 [0.17, 3.42]
Sputum 0 5 8.59 [8.25, 9.27] 6.87 [6.69, 7.50] 99.19 [98.86, 99.21] 0.06 [0.06, 0.09]
Sputum 1 25 12.23 [10.34, 13.73] 7.71 [3.76, 8.82] 87.45 [47.33, 92.94] 1.16 [0.47, 4.19]

QC table by treatment methods

Changes were sample type * treatment specific

# Formats a numeric vector as "median [Q1, Q3]".
#   x      numeric vector to summarise
#   digits number of decimals to round to and to print
# Each statistic is formatted on its own so that values are not padded to a
# common width; thousands are separated with ",".
format_median_iqr <- function(x, digits = 2) {
        stats <- c(median(x), quantile(x, probs = c(0.25, 0.75), names = FALSE))
        stats <- vapply(round(stats, digits), format, character(1),
                        nsmall = digits, big.mark = ",")
        paste0(stats[1], " [", stats[2], ", ", stats[3], "]")
}

# One row per sample type x treatment: sample count plus median [IQR] of raw
# reads, host reads, host-read percentage and final reads. Read counts are
# reported in units of 10^7 reads. Rendered as an HTML table.
sample_data %>%
        data.frame() %>%
        #dplyr::filter(sample_type %in% c("Sputum", "nasal_swab", "BAL")) %>%
        group_by(sample_type, treatment) %>%
        summarise(
                N = n(),
                `Raw reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Raw_reads / 1e7),
                `Host reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Host_mapped / 1e7),
                `Host reads proportion<br>(median [IQR])<br>[%]` = format_median_iqr(sequencing_host_prop * 100),
                `Final reads<br>(median [IQR])<br>[reads x 10<sup>7</sup>]` = format_median_iqr(Final_reads / 1e7)
        ) %>%
        # data.frame() also drops the grouping left over from summarise().
        data.frame(check.names = FALSE) %>%
        arrange(sample_type, treatment) %>%
        rename(`Sample type` = sample_type, Treatment = treatment) %>%
        mutate(across(everything(), linebreak)) %>%
        kbl(format = "html", escape = FALSE) %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Sample type Treatment N Raw reads
(median [IQR])
[reads x 107]
Host reads
(median [IQR])
[reads x 107]
Host reads proportion
(median [IQR])
[%]
Final reads
(median [IQR])
[reads x 107]
Neg. Untreated 6 0.20 [0.17, 0.22] 0.02 [0.01, 0.02] 16.85 [7.97, 20.09] 0.08 [0.06, 0.11]
Neg. lyPMA 5 0.16 [0.15, 0.19] 0.01 [0.01, 0.01] 16.55 [14.92, 16.70] 0.07 [0.07, 0.08]
Neg. Benzonase 5 0.19 [0.17, 0.22] 0.02 [0.02, 0.05] 17.49 [15.45, 27.40] 0.09 [0.08, 0.10]
Neg. Host zero 5 0.29 [0.24, 0.40] 0.02 [0.02, 0.04] 13.14 [10.22, 28.16] 0.13 [0.10, 0.18]
Neg. Molysis 5 0.19 [0.18, 0.22] 0.03 [0.02, 0.03] 19.94 [16.61, 19.95] 0.10 [0.09, 0.13]
Neg. QIAamp 5 0.30 [0.25, 0.31] 0.02 [0.02, 0.03] 17.61 [13.26, 20.10] 0.12 [0.10, 0.14]
Mock Untreated 6 10.88 [10.36, 11.02] 0.03 [0.02, 0.03] 0.30 [0.28, 0.31] 10.02 [9.60, 10.21]
Mock lyPMA 5 2.07 [0.51, 8.23] 0.09 [0.03, 0.10] 4.91 [1.42, 7.75] 1.61 [0.31, 7.29]
Mock Benzonase 5 11.32 [8.01, 11.55] 0.07 [0.05, 0.07] 0.64 [0.63, 0.64] 10.30 [7.27, 10.52]
Mock Host zero 5 10.06 [9.64, 12.26] 0.06 [0.06, 0.07] 0.65 [0.63, 0.65] 9.15 [8.81, 11.25]
Mock Molysis 5 10.53 [8.16, 10.58] 0.06 [0.05, 0.06] 0.64 [0.64, 0.65] 9.58 [7.45, 9.79]
Mock QIAamp 5 11.97 [11.83, 13.59] 0.07 [0.07, 0.07] 0.64 [0.61, 0.64] 11.04 [10.87, 12.32]
BAL Untreated 5 15.73 [6.35, 15.92] 12.92 [5.21, 12.94] 99.72 [99.59, 99.75] 0.03 [0.03, 0.04]
BAL lyPMA 5 5.72 [3.59, 13.41] 4.65 [2.79, 10.90] 99.08 [97.84, 99.46] 0.06 [0.04, 0.10]
BAL Benzonase 5 18.59 [16.20, 23.63] 14.77 [12.80, 18.16] 98.81 [98.72, 98.92] 0.17 [0.16, 0.22]
BAL Host zero 5 4.57 [2.32, 4.71] 2.69 [1.61, 2.93] 83.65 [76.75, 87.19] 0.24 [0.13, 0.82]
BAL Molysis 5 4.76 [3.57, 4.86] 2.78 [1.39, 3.61] 92.52 [92.48, 93.61] 0.29 [0.13, 1.56]
BAL QIAamp 5 17.19 [15.35, 17.43] 11.87 [10.79, 12.22] 98.35 [92.28, 98.57] 0.26 [0.10, 1.02]
Nasal Untreated 10 13.09 [7.73, 16.93] 10.05 [6.11, 13.04] 94.05 [92.82, 97.87] 0.48 [0.10, 0.87]
Nasal lyPMA 5 0.98 [0.85, 1.24] 0.63 [0.28, 0.88] 91.25 [35.63, 91.64] 0.07 [0.06, 0.08]
Nasal Benzonase 5 5.75 [4.95, 6.57] 3.66 [1.29, 5.05] 78.71 [77.84, 94.79] 0.28 [0.26, 1.04]
Nasal Host zero 5 2.83 [1.42, 6.42] 0.49 [0.03, 0.81] 8.92 [2.70, 30.39] 2.43 [0.97, 5.03]
Nasal Molysis 5 0.99 [0.63, 4.08] 0.42 [0.06, 0.64] 49.94 [5.04, 78.44] 0.32 [0.17, 2.53]
Nasal QIAamp 5 6.40 [6.40, 6.80] 0.86 [0.86, 1.17] 20.06 [15.74, 23.21] 4.63 [4.50, 4.67]
Sputum Untreated 5 8.59 [8.25, 9.27] 6.87 [6.69, 7.50] 99.19 [98.86, 99.21] 0.06 [0.06, 0.09]
Sputum lyPMA 5 10.98 [5.22, 12.78] 8.82 [3.76, 10.44] 96.38 [92.54, 98.28] 0.25 [0.15, 0.44]
Sputum Benzonase 5 10.76 [10.34, 10.82] 7.81 [7.75, 8.24] 94.19 [92.94, 94.47] 0.47 [0.45, 0.59]
Sputum Host zero 5 13.14 [7.64, 13.95] 4.39 [3.80, 7.71] 61.67 [37.50, 68.00] 2.91 [2.36, 3.67]
Sputum Molysis 5 12.59 [10.84, 13.73] 2.98 [1.83, 4.28] 32.79 [17.02, 33.83] 6.11 [5.56, 8.37]
Sputum QIAamp 5 12.35 [12.23, 12.85] 9.08 [8.41, 9.27] 88.18 [68.85, 88.64] 1.16 [1.13, 3.89]

Figure of reads by treatment (z-score)

Changes were sample type * treatment specific

# Summary figures - facet and z-score -------------------------------------
# Box plots of z-scored read statistics by treatment, one panel per
# sample type x statistic. Z-scores are computed within each
# (statistic, sample type) group so panels are comparable.
sample_data %>%
        subset(., !is.na(.$subject_id)) %>%
        data.frame() %>%
        gather(feature, value, Raw_reads:sequencing_host_prop) %>%
        dplyr::filter(feature %in% c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop")) %>%
        group_by(feature, sample_type) %>%
        # scale() returns a one-column matrix carrying centre/scale
        # attributes; as.numeric() stores a plain numeric vector instead.
        mutate(z_score = as.numeric(scale(value))) %>%
        ungroup() %>%
        mutate(feature = factor(feature,
                                levels = c("Raw_reads", "Host_mapped", "Final_reads", "sequencing_host_prop"),
                                labels = c("Raw reads", "Host mapped", "Final reads", "Host %"))) %>%
        ggplot(aes(x = treatment, y = z_score, fill = treatment)) +
                geom_boxplot(lwd = 0.2) +
                facet_grid(sample_type ~ feature) +
                xlab("Treatment") +
                ylab("Z score") +
                theme_classic(base_family = "serif", base_size = 14) +
                theme(legend.position = "top") +
                # Single guides() call: one-row fill legend and x-axis labels
                # rotated by 90 degrees.
                guides(fill = guide_legend(title = "Treatment", nrow = 1),
                       x = guide_axis(angle = 90)) +
                scale_fill_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"), name = "Treatment") #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6

Results:

2.1. There were no differences in raw reads.

2.2. However, final reads increased after some treatment, and host DNA proportion decreased

QC3. Positive and negative controls

Positive and negative controls were compared with mock community

Reads and host % by treatment

Species richness of controls

Some possible contaminants were identified in extraction controls

# Theoretical composition of the Zymo mock community, read from Excel.
# The first column becomes the row names, the `Mock` column is renamed to
# `mock_theoretical` and divided by 100, and the result is combined with the
# taxonomy table of the count object.
zymo_mock <- "/Users/minsikkim/Dropbox (Partners HealthCare)/@minsik/project_sicas2/data_raw/DAR_20210929_zymo_mock_data.xlsx" %>%
        read_excel() %>%
        data.frame(row.names = TRUE) %>%
        rename(mock_theoretical = Mock) %>%
        mutate(mock_theoretical = mock_theoretical / 100) %>%
        merge_phyloseq(
                otu_table(., taxa_are_rows = TRUE),
                tax_table(phyloseq$phyloseq_count)
        )

# One-row sample metadata for the theoretical mock ("mock_theoretical"),
# merged with the abundance/taxonomy object built above.
phyloseq_mock <- data.frame(
        sample_type = "Mock theoretical",
        treatment = "-",
        row.names = "mock_theoretical"
) %>%
        merge_phyloseq(sample_data(.), zymo_mock)

# Relative-abundance data restricted to the Mock and Neg. control samples.
phyloseq_control_rel <- subset_samples(
        phyloseq$phyloseq_rel,
        sample_type %in% c("Mock", "Neg.")
)
# Store treatment and sample_type as character before merging with the
# theoretical mock, whose metadata columns are character.
sample_data(phyloseq_control_rel)$treatment <-
        as.character(sample_data(phyloseq_control_rel)$treatment)
sample_data(phyloseq_control_rel)$sample_type <-
        as.character(sample_data(phyloseq_control_rel)$sample_type)
phyloseq_control_rel <- merge_phyloseq(phyloseq_control_rel, phyloseq_mock)


# Species richness of each control group.
# S.obs = number of taxa with a non-zero abundance in each sample.
# NOTE(review): rowSums(t(...)) gives per-sample counts only if taxa are rows
# in this otu_table -- confirm.
sample_data(phyloseq_control_rel)$S.obs <- rowSums(t(otu_table(phyloseq_control_rel)) != 0)

# Fix the display order of sample types and treatments.
sample_data(phyloseq_control_rel)$sample_type <-
        factor(sample_data(phyloseq_control_rel)$sample_type,
               levels = c("Mock theoretical", "Mock", "Neg."))
# Assign to `treatment` itself: the previous spelling (`teratment`) created a
# stray column and left `treatment` as an alphabetically sorted character
# vector, so the table below ignored the intended order.
sample_data(phyloseq_control_rel)$treatment <-
        factor(sample_data(phyloseq_control_rel)$treatment,
               levels = c("-", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"))

# Mean and SD of S.obs per sample type x treatment, rendered as an HTML table.
phyloseq_control_rel %>%
        sample_data() %>%
        group_by(sample_type, treatment) %>%
        summarise(Mean = mean(S.obs),
                  SD = sd(S.obs)) %>%
        kbl(format = "html", caption = "Species richness of controls") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Species richness of controls
sample_type treatment Mean SD
Mock theoretical
10.000000 NA
Mock Benzonase 24.400000 1.1401754
Mock Host zero 27.000000 7.8421936
Mock Molysis 28.400000 1.1401754
Mock QIAamp 25.800000 0.4472136
Mock Untreated 41.333333 1.5055453
Mock lyPMA 39.800000 15.9279628
Neg. Benzonase 6.800000 2.3874673
Neg. Host zero 9.400000 2.1908902
Neg. Molysis 8.200000 3.3466401
Neg. QIAamp 9.200000 3.4205263
Neg. Untreated 8.333333 7.9162281
Neg. lyPMA 11.000000 12.3490890

Bar plot of controls

Some possible contaminants were identified in extraction controls

Some changes are visible in the positive control…

# Stacked bar plot of all control samples, filled by the 10 taxa with the
# largest taxa_sums(); every other taxon is labelled "[Others]".
# The taxonomy table gets an extra `species20` column (default "[Others]"),
# is edited inside the braces, and is then attached to a copy of
# phyloseq_control_rel that is passed on to plot_bar().
tax_table(phyloseq_control_rel) %>%
        cbind(species20 = "[Others]") %>%
        # Despite its name, `top20species` holds the row names of the top 10
        # taxa (head(..., 10)), sorted by decreasing taxa_sums().
        {top20species <- head(taxa_sums(phyloseq_control_rel) %>%
                                data.frame %>%
                                arrange(-.) %>%
                                row.names(), 10)
   # For the top taxa, replace "[Others]" with the value of their Species column.
   .[top20species, "species20"] <- as.character(.[top20species, "Species"])
   # Clean column 8: strip "s__", turn "_" into spaces, wrap in <i>...</i>.
   # NOTE(review): column 8 is addressed by position; presumably it is meant
   # to be the `species20` label column -- confirm against the column order.
   .[, 8] <- .[, 8] %>% gsub("s__", "", .) %>% gsub("_", " ", .) %>% paste("<i>", ., "</i>", sep = "")
   phyloseq_temp <- phyloseq_control_rel
   tax_table(phyloseq_temp) <- tax_table(.) 
   phyloseq_temp
  } %>%
        plot_bar(., fill="species20") + 
        ylab("Relative abundancne") +
        theme_classic(base_size = 11, base_family = "serif") +
        ggtitle("Bar plot of control data") +
        # element_markdown() lets the legend text be rendered as markup.
        theme(legend.text = element_markdown()) +
        guides(fill=guide_legend(title="Top 10 species")) +
        facet_wrap (~ sample_type, scales= "free_x", nrow=1)

#there could be opportunistic pathogens...

Bar plot of controls (Positive)

Some possible contaminants were identified in extraction controls

Gram-negatives were fragile to the depletion methods in the positive control

# Stacked bar plot of the Mock (positive control) samples only, filled by the
# 10 taxa with the largest taxa_sums() over Mock samples with S.obs != 0;
# every other taxon is labelled "[Others]". Panels are split by treatment.

phyloseq_control_rel %>% 
        subset_samples(., sample_type == "Mock") %>% 
        tax_table() %>%
        cbind(species20 = "[Others]") %>%
        # Despite its name, `top20species` holds the row names of the top 10
        # taxa (head(..., 10)), sorted by decreasing taxa_sums().
        {top20species <- head(taxa_sums(subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)) %>%
                                data.frame %>%
                                arrange(-.) %>%
                                row.names(), 10)
   # For the top taxa, replace "[Others]" with the value of their Species column.
   .[top20species, "species20"] <- as.character(.[top20species, "Species"])
   # Clean column 8: strip "s__", turn "_" into spaces, wrap in <i>...</i>.
   # NOTE(review): column 8 is addressed by position; presumably it is meant
   # to be the `species20` label column -- confirm against the column order.
   .[, 8] <- .[, 8] %>% gsub("s__", "", .) %>% gsub("_", " ", .) %>% paste("<i>", ., "</i>", sep = "")
   # The edited taxonomy is attached to the Mock samples with S.obs != 0.
   phyloseq_temp <- subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)
   tax_table(phyloseq_temp) <- tax_table(.) 
   phyloseq_temp
  } %>%
        plot_bar(., fill="species20") + 
        ylab("Relative abundancne") +
        theme_classic(base_size = 11, base_family = "serif") +
        ggtitle("Postive controls") +
        theme(legend.text = element_markdown()) +
        guides(fill=guide_legend(title="Top 10 species")) +
        # The explicit factor levels fix the left-to-right panel order.
        facet_wrap (~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
                    scales= "free_x", nrow=1)

#there could be opportunistic pathogens...

Issue 1 - gram negative

Benzonase and Host zero depleted all the gram-negative strains

Others decreased gram-negatives a lot, but not to zero

# Gram stain data
# Stacked bar plot of the Mock samples (S.obs != 0), filled by the `Gram`
# column of the taxonomy table and split by treatment. The taxonomy table is
# prepared with the same steps as the top-10 species plots; the `species20`
# column it builds is not used as the fill here.


phyloseq_control_rel %>% 
        subset_samples(., sample_type == "Mock") %>% 
        tax_table() %>%
        cbind(species20 = "[Others]") %>%
        # Despite its name, `top20species` holds the row names of the top 10
        # taxa (head(..., 10)), sorted by decreasing taxa_sums().
        {top20species <- head(taxa_sums(subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)) %>%
                                data.frame %>%
                                arrange(-.) %>%
                                row.names(), 10)
   .[top20species, "species20"] <- as.character(.[top20species, "Species"])
   # Clean column 8: strip "s__", turn "_" into spaces, wrap in <i>...</i>.
   # NOTE(review): column 8 is addressed by position. If `Gram` sits in
   # column 8, this line rewrites the fill labels of this plot -- confirm
   # against the column order.
   .[, 8] <- .[, 8] %>% gsub("s__", "", .) %>% gsub("_", " ", .) %>% paste("<i>", ., "</i>", sep = "")
   phyloseq_temp <- subset_samples(phyloseq_control_rel,sample_type == "Mock" & S.obs != 0)
   tax_table(phyloseq_temp) <- tax_table(.) 
   phyloseq_temp
  } %>%
        plot_bar(., fill="Gram") + 
        ylab("Relative abundancne") +
        theme_classic(base_size = 11, base_family = "serif") +
        ggtitle("Gram stain in Zymo mock") +
        theme(legend.text = element_markdown()) +
        guides(fill=guide_legend(title="Gram-stain")) +
        # The explicit factor levels fix the left-to-right panel order.
        facet_wrap (~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
                    scales= "free_x", nrow=1)

#there could be opportunistic pathogens...
                                
#Manipulating phyloseq - only top 10 

# Append the relative abundances of the ten most abundant mock taxa (ranked on
# Mock samples with S.obs != 0) to the sample data, one column per taxon.
sample_data(phyloseq_control_rel) <- local({
  mock_sums <- taxa_sums(subset_samples(phyloseq_control_rel, sample_type == "Mock" & S.obs != 0))
  top_taxa <- head(names(mock_sums)[order(-mock_sums)], 10)

  abundance_df <- data.frame(otu_table(phyloseq_control_rel))
  # Keep only the top taxa rows, then transpose so the rows line up with the
  # rows of the sample data
  top_abundance <- t(subset(abundance_df, rownames(abundance_df) %in% top_taxa))

  cbind(data.frame(sample_data(phyloseq_control_rel)), top_abundance)
})


# Format a numeric vector as "median [Q1, Q3]" with a fixed number of decimals.
# formatC(format = "f") is used instead of format(nsmall = ...) because the
# latter switches to scientific notation for small values (e.g. "5e-04"),
# which made the table columns inconsistent.
format_median_iqr <- function(x, digits = 4) {
  fixed <- function(v) {
    formatC(round(v, digits), format = "f", digits = digits, big.mark = ",")
  }
  paste0(
    fixed(median(x)),
    " [", fixed(quantile(x, 0.25)), ", ", fixed(quantile(x, 0.75)), "]"
  )
}

# Median [IQR] relative abundance of the three Gram-negative mock species,
# by sample type and treatment, rendered as an HTML table.
sample_data(phyloseq_control_rel) %>%
  data.frame() %>%
  subset(., !is.na(.$Escherichia_coli)) %>%
  group_by(sample_type, treatment) %>%
  summarise(
    N = n(),
    `<i>Escherichia coli</i><br>(median [IQR])` = format_median_iqr(Escherichia_coli),
    `<i>Pseudomonas aeruginosa</i><br>(median [IQR])` = format_median_iqr(Pseudomonas_aeruginosa_group),
    `<i>Salmonella enterica</i><br>(median [IQR])` = format_median_iqr(Salmonella_enterica)
  ) %>%
  rename(`Sample type` = sample_type) %>%
  data.frame(check.names = FALSE) %>%
  mutate(across(everything(), linebreak)) %>%
  kbl(format = "html", escape = FALSE) %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sample type treatment N Escherichia coli
(median [IQR])
Pseudomonas aeruginosa
(median [IQR])
Salmonella enterica
(median [IQR])
Mock theoretical
1 0.1200 [0.1200, 0.1200] 0.1200 [0.1200, 0.1200] 0.1200 [0.1200, 0.1200]
Mock Benzonase 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Mock Host zero 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Mock Molysis 5 0.0029 [0.0022, 0.0044] 5e-04 [2e-04, 6e-04] 0.0031 [0.0022, 0.0036]
Mock QIAamp 5 5e-04 [5e-04, 0.0010] 1e-04 [1e-04, 1e-04] 5e-04 [5e-04, 7e-04]
Mock Untreated 6 0.3118 [0.2964, 0.3231] 0.0791 [0.0771, 0.0810] 0.2405 [0.2314, 0.2467]
Mock lyPMA 5 0.1514 [0.1427, 0.2281] 0.0477 [0.0443, 0.0576] 0.1168 [0.1035, 0.1840]
Neg. Benzonase 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Neg. Host zero 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Neg. Molysis 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Neg. QIAamp 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Neg. Untreated 6 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
Neg. lyPMA 5 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]

Issue 2 - Positive controls contaminants

Some possible contaminants were identified in most of samples

This could be 1) background contamination or 2) cross-contamination from kingfisher. Most of these are gram positives. Negative controls should be double-checked

#Manipulating phyloseq - only top 10 
# Build the "contaminant" object for the positive controls:
#   1. drop the ten most abundant mock taxa (ranked on Mock samples with
#      S.obs != 0),
#   2. drop taxa whose summed abundance is zero,
#   3. drop negative controls and samples with S.obs == 0.
phyloseq_control_rel_contam <- subset_taxa(
  phyloseq_control_rel,
  !(taxa_names(phyloseq_control_rel) %in%
    (function(sums) head(names(sums)[order(-sums)], 10))(
      taxa_sums(subset_samples(phyloseq_control_rel, sample_type == "Mock" & S.obs != 0))
    ))
)

phyloseq_control_rel_contam <- subset_taxa(
  phyloseq_control_rel_contam,
  taxa_sums(phyloseq_control_rel_contam) != 0
)
phyloseq_control_rel_contam <- subset_samples(
  phyloseq_control_rel_contam,
  sample_type != "Neg." & S.obs != 0
)


# Bar plot of the non-mock ("contaminant") taxa in the positive controls.
# `species20` keeps the species name of the ten most abundant remaining taxa
# and "[Others]" for the rest; it is used as the fill variable.
tax_table(phyloseq_control_rel_contam) %>%
  cbind(species20 = "[Others]") %>%
  (function(tax_mat) {
    contam_sums <- taxa_sums(phyloseq_control_rel_contam)
    top_taxa <- head(names(contam_sums)[order(-contam_sums)], 10)
    tax_mat[top_taxa, "species20"] <- as.character(tax_mat[top_taxa, "Species"])
    # Strip the "s__" prefix, turn underscores into spaces, wrap in <i> tags
    tax_mat[, 9] <- paste("<i>", gsub("_", " ", gsub("s__", "", tax_mat[, 9])), "</i>", sep = "")
    relabelled <- phyloseq_control_rel_contam
    tax_table(relabelled) <- tax_table(tax_mat)
    relabelled
  }) %>%
  plot_bar(fill = "species20") +
  ylab("Relative abundancne") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Contaminants in Zymo mock") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

Negative controls

Contaminants of positive and negative control do not match

It seems without host DNA, gram-negatives are vulnerable to depletion methods.

These negative-control contaminants were very likely introduced after depletion.

#Manipulating phyloseq - only top 10 

# Bar plot of the negative controls, filled by the ten most abundant species
# across all control samples (everything else is labelled "[Others]").
tax_table(phyloseq_control_rel) %>%
  cbind(species20 = "[Others]") %>%
  (function(tax_mat) {
    control_sums <- taxa_sums(phyloseq_control_rel)
    top_taxa <- head(names(control_sums)[order(-control_sums)], 10)
    tax_mat[top_taxa, "species20"] <- as.character(tax_mat[top_taxa, "Species"])
    # Strip the "s__" prefix, turn underscores into spaces, wrap in <i> tags
    tax_mat[, 9] <- paste("<i>", gsub("_", " ", gsub("s__", "", tax_mat[, 9])), "</i>", sep = "")
    relabelled <- phyloseq_control_rel
    tax_table(relabelled) <- tax_table(tax_mat)
    relabelled
  }) %>%
  subset_samples(sample_type == "Neg.") %>%
  plot_bar(fill = "species20") +
  ylab("Relative abundancne") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Barplot of neg. data") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Top 10 species")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

# Gram-stain composition of the negative controls, one facet per treatment.
# The fill variable is `Gram`, so the legend is titled accordingly (it was
# previously labelled "Top 10 species"); the y-axis label typo is also fixed.
phyloseq_control_rel %>%
  subset_samples(sample_type == "Neg.") %>%
  plot_bar(fill = "Gram") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram-stain of negative data") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

Samples - gram-stain?

Nasal swabs had a low proportion of Gram-positives, and the pattern persisted after depletion

BAL showed Similar gram - / + ratio

Sputum showed high decrease in gram negative bacteria

Freeze/thaw cycle could be associated

Currently no further analysis is possible

#Manipulating phyloseq - only top 10 

# Gram-stain composition of the nasal samples, one facet per treatment.
# The taxonomy table is relabelled first (top-10 species kept in `species20`,
# the rest set to "[Others]"); the bars are filled by `Gram`, so the legend is
# titled "Gram-stain" (it was previously mislabelled "Top 10 species").
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  (function(tax_mat) {
    all_sums <- taxa_sums(phyloseq$phyloseq_rel)
    top_taxa <- head(names(all_sums)[order(-all_sums)], 10)
    tax_mat[top_taxa, "species20"] <- as.character(tax_mat[top_taxa, "Species"])
    # Strip the "s__" prefix, turn underscores into spaces, wrap in <i> tags
    tax_mat[, 9] <- paste0("<i>", gsub("_", " ", gsub("s__", "", tax_mat[, 9])), "</i>")
    relabelled <- phyloseq$phyloseq_rel
    tax_table(relabelled) <- tax_table(tax_mat)
    relabelled
  }) %>%
  subset_samples(sample_type == "Nasal") %>%
  plot_bar(fill = "Gram") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of nasal samples") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

# Gram-stain composition of the BAL samples, one facet per treatment.
# The taxonomy table is relabelled first (top-10 species kept in `species20`,
# the rest set to "[Others]"); the bars are filled by `Gram`, so the legend is
# titled "Gram-stain" (it was previously mislabelled "Top 10 species").
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  (function(tax_mat) {
    all_sums <- taxa_sums(phyloseq$phyloseq_rel)
    top_taxa <- head(names(all_sums)[order(-all_sums)], 10)
    tax_mat[top_taxa, "species20"] <- as.character(tax_mat[top_taxa, "Species"])
    # Strip the "s__" prefix, turn underscores into spaces, wrap in <i> tags
    tax_mat[, 9] <- paste0("<i>", gsub("_", " ", gsub("s__", "", tax_mat[, 9])), "</i>")
    relabelled <- phyloseq$phyloseq_rel
    tax_table(relabelled) <- tax_table(tax_mat)
    relabelled
  }) %>%
  subset_samples(sample_type == "BAL") %>%
  plot_bar(fill = "Gram") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of BAL samples") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

# Gram-stain composition of the sputum samples, one facet per treatment.
# The taxonomy table is relabelled first (top-10 species kept in `species20`,
# the rest set to "[Others]"); the bars are filled by `Gram`, so the legend is
# titled "Gram-stain" (it was previously mislabelled "Top 10 species").
tax_table(phyloseq$phyloseq_rel) %>%
  cbind(species20 = "[Others]") %>%
  (function(tax_mat) {
    all_sums <- taxa_sums(phyloseq$phyloseq_rel)
    top_taxa <- head(names(all_sums)[order(-all_sums)], 10)
    tax_mat[top_taxa, "species20"] <- as.character(tax_mat[top_taxa, "Species"])
    # Strip the "s__" prefix, turn underscores into spaces, wrap in <i> tags
    tax_mat[, 9] <- paste0("<i>", gsub("_", " ", gsub("s__", "", tax_mat[, 9])), "</i>")
    relabelled <- phyloseq$phyloseq_rel
    tax_table(relabelled) <- tax_table(tax_mat)
    relabelled
  }) %>%
  subset_samples(sample_type == "Sputum") %>%
  plot_bar(fill = "Gram") +
  ylab("Relative abundance") +
  theme_classic(base_size = 11, base_family = "serif") +
  ggtitle("Gram stain of sputum samples") +
  theme(legend.text = element_markdown()) +
  guides(fill = guide_legend(title = "Gram-stain")) +
  facet_wrap(
    ~ factor(treatment, levels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")),
    scales = "free_x",
    nrow = 1
  )

Results

2.3.1. Negative control showed minimal number of possible contaminants

2.3.2. Positive control contained various contaminants

QC4. Prevalence and abundance filtering - red flag

Taxa prevalence and abundance were checked.

Taxa abundance and prevalence

Histogram of taxa prevalence

No prevalence or abundance filtering (each experimental group is 5% of total sample)

#Calculation of sample prevalence, standard deviation, median abundance across all samples for all bugs and making into a table.
#
#•  In initial analysis we will not perform prevalence or abundance filtering (though we may consider this for secondary differential abundance analyses to manage p (features) > n (sample size) problem and issues with multiple hypothesis correction)
# Per-taxon QC table: taxon name, number of samples in which the taxon has a
# non-zero count (prevalence), and mean relative abundance across samples.
taxa_qc <- local({
  rel_transposed <- t(otu_table(phyloseq$phyloseq_rel))
  # 1 where the count is positive, 0 otherwise, then transposed like the above
  detected <- t(ifelse(otu_table(phyloseq$phyloseq_count) > 0, 1, 0))

  data.frame(
    "species" = colnames(rel_transposed),
    "prevalence" = colSums(detected),
    "mean_rel_abd" = colMeans(rel_transposed, na.rm = TRUE)
  )
})

# Distribution of taxon prevalence on a log10 scale
taxa_qc$prevalence %>%
  log10() %>%
  hist(xlab = "log10(Taxa prevalence)", main = "Histogram of prevalence of taxa")

Histogram of mean abundance

# Distribution of mean relative abundance on a log10 scale
taxa_qc$mean_rel_abd %>%
  log10() %>%
  hist(xlab = "log10(Mean relative abundance)", main = "Histogram of mean relative abundance")

Red flag taxa

Taxa with low prevalences were red-flagged

# Flag (1) taxa that are both low-prevalence (prevalence below 5% of the row
# count of the transposed relative-abundance table) and low-abundance (mean
# relative abundance below the 75th percentile); all other taxa get 0.
red_flag_taxa <- local({
  n_rows <- length(rownames(t(otu_table(phyloseq$phyloseq_rel))))
  low_prevalence <- taxa_qc$prevalence < n_rows * 0.05
  low_abundance <- taxa_qc$mean_rel_abd < quantile(taxa_qc$mean_rel_abd, 0.75)

  data.frame(
    species = taxa_qc$species,
    red_flag_prev_abd = ifelse(low_prevalence & low_abundance, 1, 0)
  )
})
red_flag_taxa

QC 3 results:

3.1. In the initial analysis we will not perform prevalence or abundance filtering (though we may consider this for secondary differential abundance analyses to manage the p (features) > n (sample size) problem and issues with multiple hypothesis correction)

3.2. Red flags were made for taxa not satisfying the criteria (prev < 0.05 & mean rel < 0.75Q)

3.3. Although we do not filter on prevalence or abundance at this time, we can consider the red flags after running the DA analysis

Analysis

Before analyzing, alpha diversity indices were calculated for all phyloseq objects

# Compute alpha-diversity indices for a phyloseq object and merge them into
# its sample data.
#
# Args:
#   data: object accepted by otu_table() and sample_data()
#         (presumably a phyloseq object - confirm against callers).
#
# Returns (invisibly, as the value of the final assignment): a data frame with
# the original sample-data columns plus S.obs, data_shannon, data_hill,
# data_invsimpson, data_evenness and the columns produced by
# microbiome::dominance(), with row names restored from the merge key.
alpha_diversity <- function(data) {
        # NOTE(review): single-index subsetting (no comma) with a mask built from
        # column sums - this is not an explicit column subset; confirm which
        # entries are actually dropped for an otu_table.
        otu_table <- otu_table(data) %>% .[colSums(.) !=0]
        # Number of non-zero entries per row of the transposed table
        S.obs <- rowSums(t(otu_table) != 0)
        sample_data <- sample_data(data)
        data_evenness <- vegan::diversity(t(otu_table)) / log(vegan::specnumber(t(otu_table))) # evenness: Shannon index divided by log(number of species)
        data_shannon <- vegan::diversity(t(otu_table), index = "shannon") # Shannon index (vegan)
        data_hill <- exp(data_shannon)                           # exponential of the Shannon index
        
        data_dominance <- microbiome::dominance(otu_table, index = "all", rank = 1, aggregate = TRUE) # all dominance indices offered by microbiome::dominance
        data_invsimpson <- vegan::diversity(t(otu_table), index = "invsimpson")                          # inverse Simpson index (vegan)
        alpha_diversity <- cbind(S.obs, data_shannon, data_hill, data_invsimpson, data_evenness,data_dominance) # combine all indices in one data table
        # Full outer join on row names (by = 0); rows missing on either side are kept with NA
        sample_data <- merge(data.frame(sample_data), alpha_diversity, by = 0, all = T) %>% column_to_rownames(var = "Row.names")
}

# Attach the alpha-diversity indices to each phyloseq object.
# The relative-abundance object receives the indices computed from the count
# table; the pathway object receives indices computed from its own table.
sample_data(phyloseq$phyloseq_rel) <- phyloseq$phyloseq_count %>%
  alpha_diversity() %>%
  sample_data()
sample_data(phyloseq$phyloseq_count) <- phyloseq$phyloseq_count %>%
  alpha_diversity() %>%
  sample_data()
sample_data(phyloseq$phyloseq_path_rpkm) <- phyloseq$phyloseq_path_rpkm %>%
  alpha_diversity() %>%
  sample_data()

A1. Host DNA, bacterial DNA by sample type and treatment

qPCR and sequencing results

qPCR result

#2A: Change in total DNA (qPCR)
# Figure 2: qPCR read-outs by sample type and treatment.
# The four panels differ only in the y variable, the y-axis label and the
# panel tag, so they are built by one helper instead of four copies.

# Build one box-plot panel from the global `sample_data` data frame.
#   y         - unquoted expression of `sample_data` columns for the y axis
#   y_label   - y-axis title (markdown/HTML, rendered by element_markdown)
#   panel_tag - panel letter shown as the plot tag
plot_qpcr_panel <- function(y, y_label, panel_tag) {
  ggplot(sample_data, aes(x = sample_type, y = {{ y }})) +
    geom_boxplot(aes(fill = treatment), lwd = 0.2) +
    # colours from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
    scale_fill_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3")) +
    ylab(y_label) +
    xlab("Sample type") +
    theme_classic(base_size = 12, base_family = "serif") +
    labs(tag = panel_tag) +
    theme(plot.tag = element_text(size = 15), axis.title.y = element_markdown()) +
    guides(fill = guide_legend(nrow = 1, title = "Treatment"))
}

#2A: Change in total DNA (qPCR)
f2a <- plot_qpcr_panel(
  log10(DNA_host_nondil + DNA_bac_nondil),
  "log<sub>10</sub>(qPCR total DNA)<br>(ng/μL)",
  "A"
)

#2B: Change in human DNA (qPCR)
f2b <- plot_qpcr_panel(
  log10(DNA_host_nondil),
  "log<sub>10</sub>(qPCR host DNA)<br>(ng/μL)",
  "B"
)

#2C: Change in 16S DNA (qPCR)
f2c <- plot_qpcr_panel(
  log10(DNA_bac_nondil),
  "log<sub>10</sub>(qPCR bacterial DNA)<br>(ng/μL)",
  "C"
)

#2D. Change in % host (qPCR)
f2d <- plot_qpcr_panel(host_proportion, "Host DNA ratio", "D")

#output for markdown
ggarrange(f2a, f2b, f2c, f2d, common.legend = TRUE, align = "hv")

Figure 2. qPCR result of host depletion study. A. Total DNA B. Host DNA C. Bacterial DNA D. Host %

Sequencing result

# Figure 3: sequencing read-outs by sample type and treatment.
# The four panels differ only in the y variable, the y-axis label and the
# panel tag, so they are built by one helper. This also fixes the misspelled
# legend label ("QIAaamp" -> "QIAamp") and makes the plot-tag element the same
# in every panel.

# Build one box-plot panel from the global `sample_data` data frame.
#   y         - unquoted expression of `sample_data` columns for the y axis
#   y_label   - y-axis title (markdown/HTML, rendered by element_markdown)
#   panel_tag - panel letter shown as the plot tag
plot_sequencing_panel <- function(y, y_label, panel_tag) {
  ggplot(sample_data, aes(x = sample_type, y = {{ y }})) +
    geom_boxplot(aes(fill = treatment), lwd = 0.2) +
    # colours from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
    # NOTE(review): `labels` are positional; confirm they match the order of
    # the levels of `treatment`.
    scale_fill_manual(
      values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
      name = "Treatment",
      labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")
    ) +
    scale_x_discrete(name = "Sample type") +
    ylab(y_label) +
    theme_classic(base_size = 12, base_family = "serif") +
    theme(
      axis.title.y = element_markdown(),
      plot.tag = element_text(size = 15)
    ) +
    labs(tag = panel_tag) +
    guides(fill = guide_legend(nrow = 1))
}

#   - Raw_reads
f3a <- plot_sequencing_panel(log10(Raw_reads), "log<sub>10</sub>(raw reads)", "A")

#   - Host_mapped
f3b <- plot_sequencing_panel(log10(Host_mapped), "log<sub>10</sub>(host reads)", "B")

#   - Final_reads
f3c <- plot_sequencing_panel(log10(Final_reads), "log<sub>10</sub>(final reads)", "C")

#   - % Host (we have used Host_mapped/Raw_reads in prior papers)
f3d <- plot_sequencing_panel(sequencing_host_prop, "Host ratio by sequencing", "D")

ggarrange(f3a, f3b, f3c, f3d, common.legend = TRUE, align = "hv")

Figure 3. Sequencing result of host depletion study. A. Raw reads B. Host-mapped reads C. Final reads D. Host read ratio

Results A1

1.1. Some changes were observed, for both host DNA and bacterial DNA.

1.2. Sequencing results need to be added

This will be Fig 2. of the manuscript, after removing positives and negatives

A2. Modeling on sequencing results

As some changes were observed after treatment, linear mixed effect models were employed for testing.

Test results

Library failure - ANOVA

Some samples failed in library prep. What type of sample were fragile to treatments?

glm ( library fail ~ sample_type + treatment + sample_type * treatment + subject_id )

# Type II Wald chi-square table (car::Anova) for library failure.
# The original call was glmer() without a `family`, which only forwards to
# lme4::lmer() with a deprecation warning; the linear mixed model is now
# requested explicitly, so the fit and the table are unchanged.
# NOTE(review): `lib_failed` looks like a binary outcome; a binomial GLMM
# (glmer(..., family = binomial)) may be more appropriate if it converges.
lme4::lmer(
  lib_failed ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  Anova() %>%
  data.frame(check.names = FALSE) %>%
  # Mark terms with p < 0.05
  mutate(` ` = case_when(`Pr(>Chisq)` < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Chisq Df Pr(>Chisq)
sample_type 20.19828 4 0.0004563
treatment 41.67568 5 0.0000001
sample_type * treatment 92.13915 20 0.0000000

Library failure

glm ( library fail ~ sample_type + treatment + sample_type * treatment + subject_id )

Nasals were fragile to lyPMA and Molysis

# Coefficient table for library failure.
# The original call was glmer() without a `family`, which only forwards to
# lme4::lmer() with a deprecation warning; the linear mixed model is now
# requested explicitly, so the fit and the table are unchanged.
# NOTE(review): `lib_failed` looks like a binary outcome; a binomial GLMM
# (glmer(..., family = binomial)) may be more appropriate if it converges.
lme4::lmer(
  lib_failed ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  getElement("coefficients") %>%
  data.frame(check.names = FALSE) %>%
  # No p-values are available for this fit; |t| > 2 is used as the marker
  mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  mutate(
    x = gsub("sample_type|treatment", "", x),
    x = gsub(":", " * ", x)
  ) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error t value
(Intercept) 0.0000000 0.1317706 0.0000000
Mock 0.0000000 0.1863518 0.0000000
BAL 0.0000000 0.1630767 0.0000000
Nasal 0.0000000 0.1482523 0.0000000
Sputum 0.0000000 0.1630767 0.0000000
lyPMA 0.0000000 0.1125488 0.0000000
Benzonase 0.0000000 0.1125488 0.0000000
Host zero 0.0000000 0.1125488 0.0000000
Molysis 0.0000000 0.1125488 0.0000000
QIAamp 0.0000000 0.1125488 0.0000000
Mock * lyPMA 0.0000000 0.1591681 0.0000000
BAL * lyPMA 0.2000000 0.1627453 1.2289140
Nasal * lyPMA 0.8187152 0.1541029 5.3127843
Sputum * lyPMA 0.0000000 0.1627453 0.0000000
Mock * Benzonase 0.0000000 0.1591681 0.0000000
BAL * Benzonase 0.0000000 0.1627453 0.0000000
Nasal * Benzonase -0.0424433 0.1542275 -0.2751992
Sputum * Benzonase 0.0000000 0.1627453 0.0000000
Mock * Host zero 0.0000000 0.1591681 0.0000000
BAL * Host zero 0.2000000 0.1627453 1.2289140
Nasal * Host zero 0.3575567 0.1542275 2.3183715
Sputum * Host zero 0.0000000 0.1627453 0.0000000
Mock * Molysis 0.0000000 0.1591681 0.0000000
BAL * Molysis 0.2000000 0.1627453 1.2289140
Nasal * Molysis 0.7812848 0.1541029 5.0698914
Sputum * Molysis 0.0000000 0.1627453 0.0000000
Mock * QIAamp 0.0000000 0.1591681 0.0000000
BAL * QIAamp 0.0000000 0.1627453 0.0000000
Nasal * QIAamp 0.0424433 0.1542275 0.2751992
Sputum * QIAamp 0.0000000 0.1627453 0.0000000

Sequencing failure

Modeling of sequencing failure was not feasible due to the low number of cases.

BAL079 - control & lyPMA failed sequencing.

# Coefficient table for sequencing failure (defined here as S.obs == 0).
# The original call was glmer() without a `family`, which only forwards to
# lme4::lmer() with a deprecation warning; the linear mixed model is now
# requested explicitly, so the fit and the table are unchanged.
# NOTE(review): `sequencing_fail` is a binary outcome; a binomial GLMM
# (glmer(..., family = binomial)) may be more appropriate if it converges.
sample_data(phyloseq$phyloseq_count) %>%
  data.frame() %>%
  mutate(sequencing_fail = (S.obs == 0)) %>%
  lme4::lmer(
    sequencing_fail ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
    data = .
  ) %>%
  summary() %>%
  getElement("coefficients") %>%
  data.frame(check.names = FALSE) %>%
  # No p-values are available for this fit; |t| > 2 is used as the marker
  mutate(` ` = case_when(abs(`t value`) > 2 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  mutate(
    x = gsub("sample_type|treatment", "", x),
    x = gsub(":", " * ", x)
  ) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error t value
(Intercept) 0.0 0.0765611 0.000000
Mock 0.0 0.1082738 0.000000
BAL 0.2 0.0928632 2.153706
Nasal 0.0 0.0851034 0.000000
Sputum 0.0 0.0928632 0.000000
lyPMA 0.0 0.0591372 0.000000
Benzonase 0.0 0.0591372 0.000000
Host zero 0.0 0.0591372 0.000000
Molysis 0.0 0.0591372 0.000000
QIAamp 0.0 0.0591372 0.000000
Mock * lyPMA 0.0 0.0836326 0.000000
BAL * lyPMA 0.0 0.0855122 0.000000
Nasal * lyPMA 0.0 0.0811995 0.000000
Sputum * lyPMA 0.0 0.0855122 0.000000
Mock * Benzonase 0.0 0.0836326 0.000000
BAL * Benzonase -0.2 0.0855122 -2.338848
Nasal * Benzonase 0.0 0.0812882 0.000000
Sputum * Benzonase 0.0 0.0855122 0.000000
Mock * Host zero 0.0 0.0836326 0.000000
BAL * Host zero -0.2 0.0855122 -2.338848
Nasal * Host zero 0.0 0.0812882 0.000000
Sputum * Host zero 0.0 0.0855122 0.000000
Mock * Molysis 0.0 0.0836326 0.000000
BAL * Molysis -0.2 0.0855122 -2.338848
Nasal * Molysis 0.0 0.0811995 0.000000
Sputum * Molysis 0.0 0.0855122 0.000000
Mock * QIAamp 0.0 0.0836326 0.000000
BAL * QIAamp -0.2 0.0855122 -2.338848
Nasal * QIAamp 0.0 0.0812882 0.000000
Sputum * QIAamp 0.0 0.0855122 0.000000

log10(Final reads) - ANOVA

Which methods were effective in increasing the final reads?

Interaction term was significant

# ANOVA table for log10(final reads): sample type, treatment and their
# interaction as fixed effects, random intercept per subject.
lmer(
  log10(Final_reads) ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Mark terms with p < 0.05
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 7.219703 1.8049257 4 10.20498 11.573353 0.0008358
treatment 16.897219 3.3794439 5 113.82452 21.669311 0.0000000
sample_type * treatment 14.273189 0.7136594 20 111.51262 4.576051 0.0000001

log10(Final reads)

Which methods were effective in increasing the final reads?

lmer( log10(Final reads) vs sample_type + treatment + sample_type * treatment + subject_id )

Except for lyPMA, every method increased final reads

# Coefficient table for log10(final reads): sample type, treatment and their
# interaction as fixed effects, random intercept per subject.
lmer(
  log10(Final_reads) ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  getElement("coefficients") %>%
  data.frame(check.names = FALSE) %>%
  # Mark coefficients with p < 0.05
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Shorten term names: drop the variable prefixes, show ":" as " * "
  mutate(
    x = gsub("sample_type|treatment", "", x),
    x = gsub(":", " * ", x)
  ) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 5.8969160 0.2896787 13.69170 20.3567474 0.0000000
Mock 2.0643827 0.4096675 13.69170 5.0391660 0.0001936
BAL -0.4036097 0.3559340 21.62209 -1.1339451 0.2692339
Nasal 0.6201093 0.3245018 17.83245 1.9109582 0.0722216
Sputum -0.0703197 0.3559340 21.62209 -0.1975637 0.8452319
lyPMA 0.3624825 0.2391309 110.63626 1.5158327 0.1324137
Benzonase 0.0678140 0.2391309 110.63626 0.2835852 0.7772583
Host zero 0.2506613 0.2391309 110.63626 1.0482178 0.2968228
Molysis 0.1413267 0.2391309 110.63626 0.5910015 0.5557249
QIAamp 0.1588315 0.2391309 110.63626 0.6642031 0.5079422
Mock * lyPMA -1.1298459 0.3381822 110.63626 -3.3409380 0.0011392
BAL * lyPMA -0.0103440 0.3457828 110.63626 -0.0299146 0.9761890
Nasal * lyPMA -0.9035047 0.3277361 116.97484 -2.7568060 0.0067736
Sputum * lyPMA 0.1775924 0.3457828 110.63626 0.5135953 0.6085591
Mock * Benzonase -0.0827988 0.3381822 110.63626 -0.2448348 0.8070379
BAL * Benzonase 0.7430909 0.3457828 110.63626 2.1490108 0.0338131
Nasal * Benzonase 0.0750795 0.3280318 117.54161 0.2288786 0.8193610
Sputum * Benzonase 0.7780564 0.3457828 110.63626 2.2501306 0.0264186
Mock * Host zero -0.2212240 0.3381822 110.63626 -0.6541561 0.5143687
BAL * Host zero 0.6995291 0.3457828 110.63626 2.0230305 0.0454812
Nasal * Host zero 0.6045330 0.3280318 117.54161 1.8429094 0.0678619
Sputum * Host zero 1.4226694 0.3457828 110.63626 4.1143443 0.0000750
Mock * Molysis -0.1604345 0.3381822 110.63626 -0.4744026 0.6361479
BAL * Molysis 0.8944771 0.3457828 110.63626 2.5868180 0.0109839
Nasal * Molysis 0.0690814 0.3277361 116.97484 0.2107838 0.8334227
Sputum * Molysis 1.8487779 0.3457828 110.63626 5.3466454 0.0000005
Mock * QIAamp -0.0570843 0.3381822 110.63626 -0.1687973 0.8662643
BAL * QIAamp 0.8892670 0.3457828 110.63626 2.5717504 0.0114447
Nasal * QIAamp 0.8921752 0.3280318 117.54161 2.7197824 0.0075237
Sputum * QIAamp 1.2569571 0.3457828 110.63626 3.6351063 0.0004232

Host ratio ANOVA

Which methods were effective in lowering host %?

Interaction term was significant

# ANOVA table for the sequencing host ratio: sample type, treatment and their
# interaction as fixed effects, random intercept per subject.
lmer(
  sequencing_host_prop ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  # Mark terms with p < 0.05
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 1.140989 0.2852472 4 12.90241 19.94979 1.96e-05
treatment 1.844330 0.3688660 5 112.48557 25.79798 0.00e+00
sample_type * treatment 3.013480 0.1506740 20 111.00500 10.53793 0.00e+00

Host ratio

Which methods were effective in lowering host %?

lmer( Host DNA ratio vs sample_type + treatment + sample_type * treatment + (1|subject_id) )

Host zero was effective for all sample types. Molysis was effective for Nasal and Sputum. QIAamp was effective for Nasal only.

# Coefficient table for the sequencing host ratio: sample type, treatment and
# their interaction as fixed effects, random intercept per subject.
lmer(
  sequencing_host_prop ~ sample_type + treatment + sample_type * treatment + (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  getElement("coefficients") %>%
  data.frame(check.names = FALSE) %>%
  # Mark coefficients with p < 0.05
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Shorten term names: drop the variable prefixes, show ":" as " * "
  mutate(
    x = gsub("sample_type|treatment", "", x),
    x = gsub(":", " * ", x)
  ) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.1580489 0.1236466 14.44554 1.2782307 0.2213255
Mock -0.1555157 0.1748628 14.44554 -0.8893586 0.3883812
BAL 0.8383419 0.1439765 18.45998 5.8227692 0.0000147
Nasal 0.7942119 0.1341971 16.58294 5.9182492 0.0000187
Sputum 0.8317866 0.1439765 18.45998 5.7772389 0.0000162
lyPMA -0.0125376 0.0724064 109.71034 -0.1731564 0.8628476
Benzonase 0.0820089 0.0724064 109.71034 1.1326192 0.2598435
Host zero 0.0513961 0.0724064 109.71034 0.7098273 0.4793172
Molysis 0.0340392 0.0724064 109.71034 0.4701129 0.6392075
QIAamp 0.0122709 0.0724064 109.71034 0.1694722 0.8657373
Mock * lyPMA 0.0665440 0.1023982 109.71034 0.6498555 0.5171439
BAL * lyPMA -0.0186970 0.1046995 109.71034 -0.1785781 0.8585983
Nasal * lyPMA -0.2655229 0.1001256 114.99868 -2.6518985 0.0091339
Sputum * lyPMA -0.0255115 0.1046995 109.71034 -0.2436637 0.8079463
Mock * Benzonase -0.0782140 0.1023982 109.71034 -0.7638225 0.4466126
BAL * Benzonase -0.0934606 0.1046995 109.71034 -0.8926556 0.3739954
Nasal * Benzonase -0.2844798 0.1003189 115.63457 -2.8357539 0.0053985
Sputum * Benzonase -0.1445493 0.1046995 109.71034 -1.3806103 0.1702051
Mock * Host zero -0.0466232 0.1023982 109.71034 -0.4553132 0.6497838
BAL * Host zero -0.2339486 0.1046995 109.71034 -2.2344755 0.0274786
Nasal * Host zero -0.7898081 0.1003189 115.63457 -7.8729711 0.0000000
Sputum * Host zero -0.5061958 0.1046995 109.71034 -4.8347467 0.0000044
Mock * Molysis -0.0301512 0.1023982 109.71034 -0.2944502 0.7689703
BAL * Molysis -0.2110097 0.1046995 109.71034 -2.0153834 0.0463093
Nasal * Molysis -0.5388071 0.1001256 114.99868 -5.3813128 0.0000004
Sputum * Molysis -0.7303645 0.1046995 109.71034 -6.9758137 0.0000000
Mock * QIAamp -0.0085052 0.1023982 109.71034 -0.0830597 0.9339554
BAL * QIAamp -0.0749318 0.1046995 109.71034 -0.7156840 0.4757069
Nasal * QIAamp -0.7638065 0.1003189 115.63457 -7.6137819 0.0000000
Sputum * QIAamp -0.1992210 0.1046995 109.71034 -1.9027875 0.0596896

Gram negatives - ANOVA

Which methods changed the Gram-stain ratio?

Square root transformation was required

Interaction term was significant

# Distribution of the raw Gram-negative proportion. The argument expression is
# left exactly as written because hist() derives its default title and x-axis
# label from it.
hist(sample_data %>% data.frame %>% .$gram_neg)

# Same variable after a square-root transform (the transform used in the models below).
hist(sample_data %>% data.frame %>% .$gram_neg %>% sqrt())

# ANOVA table for the square-root Gram-negative proportion model
# (sample type x treatment, random intercept per subject).
# Terms with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  sqrt(gram_neg) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  anova() %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Show the interaction term as "A * B" instead of "A:B".
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.2519926 0.0629982 4 15.27912 3.171971 0.0440715
treatment 2.0092804 0.4018561 5 110.53768 20.233539 0.0000000
sample_type * treatment 3.6648456 0.1832423 20 109.68797 9.226288 0.0000000

Gram negatives

Which method biased gram positive-negative ratio

lmer( Gram-negative ratio vs sample_type + treatment + sample_type * treatment + (1|subject_id) )

Some treatment (commercial) changed gram negative proportion

# Coefficient table for the square-root Gram-negative proportion model
# (sample type x treatment, random intercept per subject).
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  sqrt(gram_neg) ~ sample_type + treatment + sample_type * treatment +
    (1 | subject_id),
  data = data.frame(sample_data)
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("sample_type|treatment", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.6047523 0.1991381 16.03891 3.0368496 0.0078347
Mock 0.1893684 0.2816237 16.03891 0.6724164 0.5108903
BAL 0.0050293 0.2282735 19.19085 0.0220321 0.9826498
Nasal -0.4881059 0.2127831 17.27815 -2.2939131 0.0345859
Sputum 0.2582871 0.2256043 18.34006 1.1448676 0.2669856
lyPMA -0.0837507 0.0853365 108.50402 -0.9814169 0.3285704
Benzonase -0.2799287 0.0853365 108.50402 -3.2802909 0.0013947
Host zero 0.1510556 0.0853365 108.50402 1.7701162 0.0795165
Molysis 0.1211041 0.0853365 108.50402 1.4191349 0.1587260
QIAamp -0.0908688 0.0853365 108.50402 -1.0648287 0.2893173
Mock * lyPMA -0.0896218 0.1206841 108.50402 -0.7426149 0.4593196
BAL * lyPMA 0.1732270 0.1311974 108.50402 1.3203533 0.1894955
Nasal * lyPMA 0.6106762 0.1187428 112.04041 5.1428470 0.0000012
Sputum * lyPMA -0.2343407 0.1233964 108.50402 -1.8990876 0.0602088
Mock * Benzonase -0.3638032 0.1206841 108.50402 -3.0145078 0.0032051
BAL * Benzonase 0.3259099 0.1282113 108.88628 2.5419740 0.0124295
Nasal * Benzonase 0.3421773 0.1190735 112.52671 2.8736635 0.0048518
Sputum * Benzonase -0.1759531 0.1233964 108.50402 -1.4259168 0.1567631
Mock * Host zero -0.7891645 0.1206841 108.50402 -6.5390923 0.0000000
BAL * Host zero -0.1973889 0.1282113 108.88628 -1.5395587 0.1265691
Nasal * Host zero -0.1470414 0.1190735 112.52671 -1.2348791 0.2194478
Sputum * Host zero -0.7742055 0.1233964 108.50402 -6.2741316 0.0000000
Mock * Molysis -0.7308145 0.1206841 108.50402 -6.0555990 0.0000000
BAL * Molysis -0.1665528 0.1282113 108.88628 -1.2990492 0.1966711
Nasal * Molysis -0.0101193 0.1187428 112.04041 -0.0852206 0.9322381
Sputum * Molysis -0.7649241 0.1233964 108.50402 -6.1989155 0.0000000
Mock * QIAamp -0.5198421 0.1206841 108.50402 -4.3074614 0.0000364
BAL * QIAamp 0.0754132 0.1282113 108.88628 0.5881948 0.5576196
Nasal * QIAamp 0.1199762 0.1190735 112.52671 1.0075804 0.3158177
Sputum * QIAamp -0.5568238 0.1233964 108.50402 -4.5124785 0.0000163

Results

1. Library failure was associated with Nasal, especially after lyPMA and Molysis treatment

2. Benzonase, host-zero, Molysis, and QIAamp increased final reads

3. Host-zero lowered host %. For others, there were significant sample_type-specific treatment efficiencies

A3. LM of taxa alpha diversity

Alpha diversity could be having changes due to treatment.

Both stratified and nonstratified analyses were conducted.

Figure - Alpha diversity

# Sample metadata of the count-based phyloseq object; S.obs and
# data_invsimpson are read from it for the two panels below.
sample_data <- sample_data(phyloseq$phyloseq_count)

# Panel A: observed species richness by treatment, one facet per sample type.
f4a <- ggplot(
  subset(
    sample_data(phyloseq$phyloseq_count),
    sample_data$sample_type %in% c("Sputum", "Nasal", "BAL", "Mock", "Neg.")
  ),
  aes(y = S.obs)
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  # Colours: 6-class ColorBrewer "Paired" palette in reverse order
  # (https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=6).
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
    name = "Treatment",
    labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")
  ) +
  ylab("Species richness") +
  theme_classic(base_size = 12, base_family = "serif") +
  labs(tag = "A") +
  # Boxes are identified by fill colour only, so the x axis carries no labels.
  theme(
    plot.tag = element_text(size = 15),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_wrap(~sample_type, nrow = 1) +
  guides(fill = guide_legend(nrow = 1))

# Panel B: inverse Simpson index, same layout and colour scale as panel A.
f4b <- ggplot(
  subset(
    sample_data(phyloseq$phyloseq_count),
    sample_data$sample_type %in% c("Sputum", "Nasal", "BAL", "Mock", "Neg.")
  ),
  aes(y = data_invsimpson)
) +
  geom_boxplot(aes(fill = treatment), lwd = 0.2) +
  #scale_fill_viridis(discrete = 6, name = "Treatment", labels = c("Untreated","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + # color using viridis
  scale_fill_manual(
    values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
    name = "Treatment",
    labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")
  ) +
  ylab("Inverse simpson") +
  theme_classic(base_size = 12, base_family = "serif") +
  labs(tag = "B") +
  theme(
    plot.tag = element_text(size = 15),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  facet_wrap(~sample_type, nrow = 1) +
  guides(fill = guide_legend(nrow = 1))

# Stack both panels vertically with a single shared legend.
ggarrange(f4a, f4b, common.legend = TRUE, align = "hv", ncol = 1) # alpha diversity plots

Species richness

All samples:

S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

S.obs ~ sample_type + log10 (Final_reads) + (1|original_sample)

Species richness (all samples & interaction term) - ANOVA

Interaction term was significant

# Plain data frame of sample metadata, without rows whose Simpson index is NaN.
sample_data <- data.frame(
  sample_data(phyloseq$phyloseq_count),
  check.names = FALSE
) %>%
  subset(., !is.nan(.$simpson))

# Species richness: sample type x treatment plus log10 sequencing depth,
# with a random intercept per original sample.
lmer_sob <- lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | original_sample),
  data = sample_data
)

# ANOVA table; terms with |p| < 0.05 get a "*" in an unnamed trailing column.
anova(lmer_sob) %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 9960.852 2490.213 4 18.45264 33.245597 0.0000000
treatment 1528.150 305.630 5 30.68974 4.080314 0.0058869
log10(Final_reads) 3981.388 3981.388 1 114.44646 53.153537 0.0000000
sample_type:treatment 21131.669 1056.583 20 51.28486 14.105921 0.0000000

Species richness (all samples & interaction term)

Sputum showed an increase with every treatment. Positive and negative controls showed no changes.

# Coefficient table for species richness across all samples.
# NOTE(review): the random intercept here is grouped by subject_id, whereas
# the ANOVA fit above (lmer_sob) groups by original_sample - confirm which
# grouping is intended.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = sample_data
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -80.047738 17.008298 53.43860 -4.7063933 0.0000183
Mock 1.734738 17.593398 18.71396 0.0986016 0.9225031
BAL 4.860057 13.888167 20.21759 0.3499423 0.7300011
Nasal -6.274752 12.973679 18.37580 -0.4836525 0.6343457
Sputum 9.214633 13.693099 19.16482 0.6729399 0.5090194
lyPMA -2.635369 5.253967 108.47693 -0.5015961 0.6169682
Benzonase -2.043958 5.204004 108.30975 -0.3927664 0.6952640
Host zero -3.168911 5.227009 108.38739 -0.6062569 0.5456118
Molysis -2.139510 5.210088 108.33039 -0.4106476 0.6821421
QIAamp -1.000382 5.212165 108.33742 -0.1919321 0.8481547
log10(Final_reads) 14.902887 2.030006 116.27333 7.3413036 0.0000000
Mock * lyPMA 12.737965 7.706230 109.08208 1.6529438 0.1012177
BAL * lyPMA -1.349276 7.998293 108.30443 -0.1686955 0.8663508
Nasal * lyPMA 5.285949 7.471377 112.12777 0.7074932 0.4807279
Sputum * lyPMA 32.986694 7.530978 108.32384 4.3801343 0.0000275
Mock * Benzonase -13.666059 7.358917 108.30817 -1.8570748 0.0660176
BAL * Benzonase -4.719308 7.985511 109.21176 -0.5909838 0.5557524
Nasal * Benzonase -1.690600 7.260902 112.19270 -0.2328361 0.8163129
Sputum * Benzonase 57.838047 7.686374 108.67609 7.5247507 0.0000000
Mock * Host zero -10.603124 7.370691 108.33641 -1.4385522 0.1531601
BAL * Host zero -1.270111 7.967847 109.16695 -0.1594045 0.8736447
Nasal * Host zero 3.419015 7.369171 112.51396 0.4639620 0.6435714
Sputum * Host zero 89.231452 8.057689 109.42878 11.0740750 0.0000000
Mock * Molysis -9.509063 7.364202 108.32087 -1.2912549 0.1993641
BAL * Molysis 10.224601 8.054150 109.37844 1.2694823 0.2069635
Nasal * Molysis 4.816033 7.239822 111.70287 0.6652143 0.5072846
Sputum * Molysis 95.881206 8.406598 110.03853 11.4054712 0.0000000
Mock * QIAamp -15.049280 7.357910 108.30575 -2.0453200 0.0432470
BAL * QIAamp -4.097753 8.051603 109.37244 -0.5089364 0.6118220
Nasal * QIAamp -7.457621 7.469458 112.08778 -0.9984152 0.3202286
Sputum * QIAamp 65.901043 7.943329 109.20933 8.2964015 0.0000000

Species richness - stratified (Pos + Neg)

No treatment increased species richness after adjusting for sequencing depth. With the mock community, all treatments except lyPMA even reduced the possible contaminants. Need to observe alpha diversity of positive controls

# Species richness in the controls only (Neg. and Mock), fitted as an
# ordinary linear model with no random effect.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lm(
  S.obs ~ sample_type * treatment + log10(Final_reads),
  data = subset(sample_data, sample_type %in% c("Neg.", "Mock"))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error t value Pr(>|t|)
(Intercept) -77.260345 7.499432 -10.3021600 0.0000000
Mock 2.710544 3.274859 0.8276826 0.4118633
lyPMA -2.464028 2.166841 -1.1371523 0.2610050
Benzonase -2.011903 2.120747 -0.9486763 0.3474409
Host zero -3.050426 2.142040 -1.4240754 0.1607621
Molysis -2.072706 2.126390 -0.9747538 0.3344702
QIAamp -0.925304 2.128315 -0.4347590 0.6656461
log10(Final_reads) 14.430200 1.248465 11.5583574 0.0000000
Mock * lyPMA 12.203902 3.312177 3.6845556 0.0005722
Mock * Benzonase -13.705197 2.998581 -4.5705604 0.0000331
Mock * Host zero -10.707694 3.009499 -3.5579653 0.0008412
Mock * Molysis -9.584898 3.003485 -3.1912586 0.0024731
Mock * QIAamp -15.076263 2.997646 -5.0293668 0.0000070

Species richness - stratified (NS + Pos + Neg)

Molysis and Host zero may have increased species richness of Nasal samples. Data include nasal swab, positive depletion, and negative depletion.

# Species richness restricted to Nasal samples plus the Mock and Neg. controls,
# with a random intercept per subject.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Nasal", "Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -61.5968651 7.141345 45.200273 -8.6253873 0.0000000
Mock 8.1939894 5.861474 8.818737 1.3979400 0.1962941
Nasal -4.3344907 4.238467 8.089634 -1.0226553 0.3360766
lyPMA -1.5011967 2.229134 69.324661 -0.6734439 0.5029040
Benzonase -1.8317743 2.199470 69.068228 -0.8328252 0.4078140
Host zero -2.3846158 2.213143 69.187994 -1.0774792 0.2850087
Molysis -1.6973122 2.203088 69.100187 -0.7704240 0.4436746
QIAamp -0.5034133 2.204324 69.111052 -0.2283754 0.8200288
log10(Final_reads) 11.7739846 1.017878 76.540074 11.5671904 0.0000000
Mock * lyPMA 9.2027877 3.314878 70.208146 2.7762074 0.0070450
Nasal * lyPMA 2.4311825 3.179741 72.982926 0.7645850 0.4469838
Mock * Benzonase -13.9251284 3.110131 69.065784 -4.4773450 0.0000291
Nasal * Benzonase -1.2838446 3.049614 72.822187 -0.4209859 0.6750053
Mock * Host zero -11.2953122 3.117133 69.109488 -3.6236226 0.0005505
Nasal * Host zero 5.4823784 3.112760 73.257010 1.7612597 0.0823672
Mock * Molysis -10.0110464 3.113275 69.085451 -3.2156003 0.0019810
Nasal * Molysis 5.0599707 3.043042 72.398920 1.6628002 0.1006761
Mock * QIAamp -15.2278909 3.109532 69.062027 -4.8971656 0.0000062
Nasal * QIAamp -4.8379307 3.176346 72.738691 -1.5231120 0.1320667

Species richness (BAL + Pos + Neg)

No changes observed

# Species richness restricted to BAL samples plus the Mock and Neg. controls,
# with a random intercept per subject.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("BAL", "Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -86.9419958 14.195115 13.415935 -6.1247830 0.0000317
Mock -0.6787926 14.953730 4.410950 -0.0453929 0.9657761
BAL 5.4704776 11.638199 4.497464 0.4700450 0.6602220
lyPMA -3.0591579 3.567716 67.090622 -0.8574556 0.3942459
Benzonase -2.1232410 3.518550 67.045378 -0.6034420 0.5482509
Host zero -3.4619661 3.541215 67.066496 -0.9776209 0.3317752
Molysis -2.3047390 3.524548 67.051011 -0.6539104 0.5154060
QIAamp -1.1860761 3.526596 67.052926 -0.3363232 0.7376774
log10(Final_reads) 16.0720161 1.657634 68.551578 9.6957548 0.0000000
Mock * lyPMA 14.0589008 5.314393 67.247488 2.6454390 0.0101480
BAL * lyPMA -1.3949813 5.407089 67.043923 -0.2579912 0.7972040
Mock * Benzonase -13.5692567 4.975335 67.044947 -2.7273052 0.0081427
BAL * Benzonase -5.7266255 5.453212 67.394364 -1.0501381 0.2974058
Mock * Host zero -10.3444846 4.986942 67.052650 -2.0743140 0.0418926
BAL * Host zero -2.2264993 5.435884 67.380448 -0.4095929 0.6834038
Mock * Molysis -9.3214938 4.980547 67.048413 -1.8715804 0.0656278
BAL * Molysis 9.0402931 5.520357 67.444404 1.6376285 0.1061570
Mock * QIAamp -14.9825410 4.974342 67.044285 -3.0119646 0.0036587
BAL * QIAamp -5.2759699 5.517870 67.442644 -0.9561607 0.3424060

Species richness (sputum + Pos + Neg)

Benzonase may have increased species richness of sputum

# Species richness restricted to Sputum samples plus the Mock and Neg.
# controls, with a random intercept per original sample.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | original_sample),
  data = subset(sample_data, sample_type %in% c("Sputum", "Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -92.3236354 28.562823 13.478941 -3.2323008 0.0062829
Mock -0.7292052 30.205870 4.419012 -0.0241412 0.9817914
Sputum 10.5636183 23.206927 4.277837 0.4551925 0.6711239
lyPMA -2.0980392 5.952543 67.005568 -0.3524610 0.7255988
Benzonase -0.9540210 5.908482 67.002525 -0.1614663 0.8722120
Host zero -2.4218768 5.917865 67.003178 -0.4092484 0.6836627
Molysis -1.1874354 5.905160 67.002292 -0.2010844 0.8412418
QIAamp -0.0811347 29.557837 4.053008 -0.0027449 0.9979397
log10(Final_reads) 16.7782375 3.230107 67.206668 5.1943292 0.0000021
Mock * lyPMA 13.4493235 30.311724 4.481516 0.4437004 0.6778513
Sputum * lyPMA 31.4365340 8.409957 67.005171 3.7380137 0.0003861
Mock * Benzonase -14.9182810 30.142296 4.382641 -0.4949285 0.6444466
Sputum * Benzonase 55.1618069 8.851291 67.024952 6.2320634 0.0000000
Mock * Host zero -11.5957501 30.142990 4.383043 -0.3846914 0.7184121
Sputum * Host zero 85.3463367 9.741528 67.056899 8.7610831 0.0000000
Mock * Molysis -10.6156900 30.141868 4.382393 -0.3521908 0.7409875
Sputum * Molysis 91.1969876 10.516606 67.078308 8.6717129 0.0000000
Mock * QIAamp -16.3497254 30.142897 4.382989 -0.5424072 0.6139508
Sputum * QIAamp 62.3266963 30.472969 4.577081 2.0453109 0.1014241

Simpson

Inverse Simpson of all samples:

Inverse Simpson ~ sample_type * treatment + log10(Final_reads) + (1|original_sample)

Stratified:

Inverse Simpson ~ treatment + (1|original_sample)

Inv Simp - ANOVA

Final reads did not affect inverse Simpson

# Inverse Simpson: sample type x treatment plus log10 sequencing depth,
# with a random intercept per subject.
lmer_invsimpson <- lmer(
  data_invsimpson ~ sample_type * treatment + log10(Final_reads) +
    (1 | subject_id),
  data = sample_data
)

# ANOVA table; terms with |p| < 0.05 get a "*" in an unnamed trailing column.
anova(lmer_invsimpson) %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Show the interaction term as "A * B" instead of "A:B".
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 89.193944 22.298486 4 16.14294 5.8258848 0.0042571
treatment 45.186056 9.037211 5 112.77039 2.3611357 0.0444302
log10(Final_reads) 1.333353 1.333353 1 120.16332 0.3483626 0.5561498
sample_type * treatment 236.399387 11.819969 20 110.50529 3.0881819 0.0000855
# Refit without the log10(Final_reads) covariate; this overwrites the
# lmer_invsimpson object fitted for the ANOVA.
lmer_invsimpson <- lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = sample_data
)

Simpson (all samples & interaction term)

Sputum after treatment showed differences - stratified analysis is required

#Simpson

# Coefficient table of the current lmer_invsimpson fit.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
summary(lmer_invsimpson) %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.1596359 2.110581 16.55212 1.4970458 0.1532070
Mock 2.1313077 2.984812 16.55212 0.7140510 0.4851440
BAL -0.4036662 2.491790 22.12652 -0.1619985 0.8727773
Nasal -1.0199272 2.284181 18.73857 -0.4465177 0.6603407
Sputum -0.2566351 2.445488 20.64163 -0.1049423 0.9174347
lyPMA -0.6047497 1.181903 109.86264 -0.5116745 0.6099058
Benzonase -0.4369252 1.181903 109.86264 -0.3696794 0.7123330
Host zero -0.4071159 1.181903 109.86264 -0.3444579 0.7311604
Molysis -0.2823875 1.181903 109.86264 -0.2389261 0.8116080
QIAamp 0.5267480 1.181903 109.86264 0.4456778 0.6567070
Mock * lyPMA 3.0329696 1.671463 109.86264 1.8145593 0.0723204
BAL * lyPMA 0.3912303 1.817072 109.86264 0.2153080 0.8299264
Nasal * lyPMA 0.5572614 1.636030 114.30731 0.3406181 0.7340163
Sputum * lyPMA 4.1940580 1.709029 109.86264 2.4540589 0.0156959
Mock * Benzonase 0.6013923 1.671463 109.86264 0.3597999 0.7196875
BAL * Benzonase 0.0152940 1.774650 110.45106 0.0086180 0.9931395
Nasal * Benzonase 0.8370388 1.639407 114.85265 0.5105741 0.6106292
Sputum * Benzonase 8.7134085 1.709029 109.86264 5.0984553 0.0000014
Mock * Host zero -0.7335174 1.671463 109.86264 -0.4388474 0.6616340
BAL * Host zero -0.6540204 1.774650 110.45106 -0.3685349 0.7131799
Nasal * Host zero 0.5651256 1.639407 114.85265 0.3447134 0.7309403
Sputum * Host zero 6.6176648 1.709029 109.86264 3.8721780 0.0001836
Mock * Molysis -0.7440630 1.671463 109.86264 -0.4451566 0.6570825
BAL * Molysis 2.6186628 1.774650 110.45106 1.4755938 0.1428981
Nasal * Molysis 0.9316357 1.636030 114.30731 0.5694490 0.5701688
Sputum * Molysis 6.9900959 1.709029 109.86264 4.0900976 0.0000824
Mock * QIAamp -1.4438502 1.671463 109.86264 -0.8638240 0.3895666
BAL * QIAamp -1.3151584 1.774650 110.45106 -0.7410804 0.4602174
Nasal * QIAamp -0.7602948 1.639407 114.85265 -0.4637621 0.6436959
Sputum * QIAamp 4.3874377 1.709029 109.86264 2.5672107 0.0115965

Inverse Simpson - stratified (Untreateds)

Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)

Mock community treated with lyPMA showed changes in alpha diversity

# Inverse Simpson in the controls only (Mock and Neg.), with a random
# intercept per subject.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.1596359 1.1263953 50 2.8050862 0.0071468
Mock 2.1313077 1.5929635 50 1.3379514 0.1869658
lyPMA -0.6047497 0.7881973 50 -0.7672567 0.4465369
Benzonase -0.4369252 0.7881973 50 -0.5543348 0.5818208
Host zero -0.4071159 0.7881973 50 -0.5165152 0.6077717
Molysis -0.2823875 0.7881973 50 -0.3582700 0.7216495
QIAamp 0.5267480 0.7881973 50 0.6682946 0.5070191
Mock * lyPMA 3.0329696 1.1146793 50 2.7209346 0.0089329
Mock * Benzonase 0.6013923 1.1146793 50 0.5395205 0.5919225
Mock * Host zero -0.7335174 1.1146793 50 -0.6580524 0.5135230
Mock * Molysis -0.7440630 1.1146793 50 -0.6675130 0.5075138
Mock * QIAamp -1.4438502 1.1146793 50 -1.2953054 0.2011637

Inverse Simpson - stratified (NS + Pos + Neg)

Inverse Simpson ~ sample_type + log10 (Final_reads) + (1|original_sample)

Mock community treated with lyPMA only showed changes in alpha diversity.

# Inverse Simpson restricted to Nasal samples plus the Mock and Neg. controls,
# with a random intercept per subject.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | subject_id),
  data = subset(sample_data, sample_type %in% c("Nasal", "Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.1596359 0.5855932 4.631504 5.3956162 0.0037204
Mock 2.1313077 0.8281538 4.631504 2.5735652 0.0535708
Nasal -1.0199272 0.6972692 7.720477 -1.4627452 0.1830195
lyPMA -0.6047497 0.6924462 71.894032 -0.8733527 0.3853787
Benzonase -0.4369252 0.6924462 71.894032 -0.6309880 0.5300493
Host zero -0.4071159 0.6924462 71.894032 -0.5879387 0.5584158
Molysis -0.2823875 0.6924462 71.894032 -0.4078114 0.6846240
QIAamp 0.5267480 0.6924462 71.894032 0.7607061 0.4493204
Mock * lyPMA 3.0329696 0.9792668 71.894032 3.0971842 0.0027859
Nasal * lyPMA 0.4293667 0.9393367 76.139565 0.4570957 0.6489038
Mock * Benzonase 0.6013923 0.9792668 71.894032 0.6141251 0.5410712
Nasal * Benzonase 0.8944160 0.9394703 76.292675 0.9520429 0.3440818
Mock * Host zero -0.7335174 0.9792668 71.894032 -0.7490476 0.4562724
Nasal * Host zero 0.6225028 0.9394703 76.292675 0.6626104 0.5095770
Mock * Molysis -0.7440630 0.9792668 71.894032 -0.7598164 0.4498488
Nasal * Molysis 1.0595304 0.9393367 76.139565 1.1279559 0.2628804
Mock * QIAamp -1.4438502 0.9792668 71.894032 -1.4744196 0.1447343
Nasal * QIAamp -0.8176720 0.9394703 76.292675 -0.8703543 0.3868371

Inverse Simpson - stratified (BAL + Pos + Neg)

Nothing changed in BAL

# Inverse Simpson restricted to BAL samples plus the Mock and Neg. controls,
# with a random intercept per original sample.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  data_invsimpson ~ sample_type * treatment + (1 | original_sample),
  data = subset(sample_data, sample_type %in% c("BAL", "Mock", "Neg."))
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 3.5745359 1.301083 3.746913 2.7473533 0.0553671
Mock 1.5373505 1.840010 3.746913 0.8355121 0.4533865
BAL -0.9457293 1.660635 6.770497 -0.5694987 0.5874053
lyPMA -1.0196497 1.157423 65.468387 -0.8809656 0.3815592
Benzonase -0.8518252 1.157423 65.468387 -0.7359672 0.4643795
Host zero -0.8220158 1.157423 65.468387 -0.7102123 0.4800948
Molysis -0.6972874 1.157423 65.468387 -0.6024484 0.5489568
QIAamp 0.1118481 1.840010 3.746913 0.0607867 0.9546317
Mock * lyPMA 3.6269268 2.173767 7.266306 1.6684983 0.1375765
BAL * lyPMA 0.8061302 1.736134 65.468387 0.4643249 0.6439554
Mock * Benzonase 1.1953495 2.173767 7.266306 0.5498977 0.5988779
BAL * Benzonase 0.5573570 1.694028 66.295151 0.3290129 0.7431831
Mock * Host zero -0.1395602 2.173767 7.266306 -0.0642020 0.9505401
BAL * Host zero -0.1119574 1.694028 66.295151 -0.0660895 0.9475055
Mock * Molysis -0.1501057 2.173767 7.266306 -0.0690533 0.9468093
BAL * Molysis 3.1607258 1.694028 66.295151 1.8658053 0.0664936
Mock * QIAamp -0.8498929 2.173767 7.266306 -0.3909770 0.7070259
BAL * QIAamp -0.7730954 2.217147 7.770971 -0.3486894 0.7365808

Inverse Simpson - stratified (spt + Pos + Neg)

Sputum changed after some treatment - but their changes were not treatment global.

# Inverse Simpson restricted to Sputum samples plus the Mock and Neg.
# controls: sample type x treatment plus log10 sequencing depth, with a random
# intercept per original sample. The formula previously read
# "log10(Final_reads) + + (1|original_sample)"; the stray unary "+" is removed.
# Rows with |p| < 0.05 get a "*" in an unnamed trailing column.
lmer(
  data_invsimpson ~ sample_type * treatment + log10(Final_reads) +
    (1 | original_sample),
  data = subset(
    sample_data,
    sample_data$sample_type == "Sputum" |
      sample_data$sample_type == "Mock" |
      sample_data$sample_type == "Neg."
  )
) %>%
  summary() %>%
  .$coefficients %>%
  data.frame(check.names = FALSE) %>%
  mutate(` ` = case_when(abs(`Pr(>|t|)`) < 0.05 ~ "*", .default = " ")) %>%
  rownames_to_column(var = "x") %>%
  # Drop the factor-name prefixes, then show interactions as "A * B".
  mutate(x = gsub("treatment|sample_type", "", x)) %>%
  mutate(x = gsub(":", " * ", x)) %>%
  column_to_rownames(var = "x") %>%
  kbl(format = "html") %>%
  kable_styling(full_width = FALSE, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -1.8297432 5.6033292 16.558650 -0.3265457 0.7481026
Mock -0.1903964 5.6258760 4.532445 -0.0338430 0.9744414
Sputum -0.4917756 4.3134952 4.352519 -0.1140086 0.9142991
lyPMA -1.2279698 1.2388830 67.006745 -0.9911911 0.3251582
Benzonase -0.7959257 1.2297164 67.002900 -0.6472433 0.5196859
Host zero -0.9300696 1.2316684 67.003726 -0.7551299 0.4528173
Molysis -0.7073044 1.2290252 67.002606 -0.5755003 0.5668808
QIAamp 0.0861351 5.4748103 4.066784 0.0157330 0.9881891
log10(Final_reads) 0.8966675 0.6721363 67.260263 1.3340561 0.1866850
Mock * lyPMA 4.5167048 5.6505287 4.612797 0.7993420 0.4632439
Sputum * lyPMA 4.3330105 1.7503369 67.006243 2.4755294 0.0158384
Mock * Benzonase 1.1462745 5.6111475 4.486335 0.2042852 0.8470693
Sputum * Benzonase 8.3139445 1.8421544 67.031236 4.5131639 0.0000265
Mock * Host zero -0.0645138 5.6113089 4.486848 -0.0114971 0.9913202
Sputum * Host zero 5.6401972 2.0273686 67.071576 2.7820285 0.0070083
Mock * Molysis -0.1295673 5.6110480 4.486019 -0.0230915 0.9825686
Sputum * Molysis 5.6305507 2.1886282 67.098590 2.5726392 0.0123107
Mock * QIAamp -0.9220254 5.6112874 4.486780 -0.1643162 0.8766238
Sputum * QIAamp 3.5585589 5.6879574 4.735317 0.6256304 0.5604643

*** Results: ***

3.1. Species richness - type * method specific. Sputum showed the largest changes, with every method

3.2. Stratified analysis showed that some methods increased some alpha diversity indices. Changes were highest in sputum. However, stratified analysis showed Benzonase was the only one that showed significant changes.

A4. Taxa beta diversity

Permanova (Taxa dist ~ log10(final reads) + sample_type + treatment + sample_type * treatment + subject_id) –> both stratified and nonstratified

Beta diversity figures

# Relative-abundance samples with non-zero S.obs, limited to the five
# sample types analysed here.
phyloseq_rel_nz <- subset_samples(
  phyloseq$phyloseq_rel,
  S.obs != 0 & sample_type %in% c("BAL", "Nasal", "Sputum", "Mock", "Neg.")
)

# PERMANOVA on Bray-Curtis distances with subject_id entered as a model term
# (no strata).
bray_perm_uni <- vegan::adonis2(
  distance(phyloseq_rel_nz, method = "bray") ~
    sample_type + log10(Final_reads) + treatment + subject_id,
  data = data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE),
  permutations = 10000
)

# Same fixed terms without subject_id; permutations are stratified by subject.
bray_perm_uni_strata <- vegan::adonis2(
  distance(phyloseq_rel_nz, method = "bray") ~
    sample_type + log10(Final_reads) + treatment,
  data = data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE),
  strata = data.frame(
    sample_data(phyloseq_rel_nz),
    check.names = FALSE
  )$subject_id,
  permutations = 10000
)

# Treatment split into one indicator term per method, stratified by subject.
bray_perm_strata <- vegan::adonis2(
  distance(phyloseq_rel_nz, method = "bray") ~
    sample_type + log10(Final_reads) +
    lypma + benzonase + host_zero + molysis + qiaamp,
  data = data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE),
  strata = data.frame(
    sample_data(phyloseq_rel_nz),
    check.names = FALSE
  )$subject_id,
  permutations = 10000
)

# Sample type x treatment interaction model, stratified by subject.
bray_perm_inter <- vegan::adonis2(
  distance(phyloseq_rel_nz, method = "bray") ~
    sample_type * treatment + log10(Final_reads),
  data = data.frame(sample_data(phyloseq_rel_nz), check.names = FALSE),
  strata = data.frame(
    sample_data(phyloseq_rel_nz),
    check.names = FALSE
  )$subject_id,
  permutations = 10000
)


bray_perm_ns <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Nasal"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                               data = subset_samples(phyloseq_rel_nz, sample_type == "Nasal") %>%
                                       sample_data %>% data.frame(check.names = F),
                               strata = subset_samples(phyloseq_rel_nz, sample_type == "Nasal") %>% 
                                       sample_data %>% data.frame(check.names = F) %>%
                                       .$subject_id, permutations = 10000)

bray_perm_bal  <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "BAL"), method="bray") ~  lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                 data = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>%
                                         sample_data %>% data.frame(check.names = F),
                                 strata = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>%
                                         sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_spt <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
                                %>% sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
                                %>% sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_pos <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz,
                                                            sample_type == "Mock"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz,
                                sample_type == "Mock") %>% sample_data %>%
                                        data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz,
                                sample_type == "Mock") %>%
                                        sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)


bray_perm_neg <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz,
                                                            sample_type == "Neg."), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz,
                                sample_type == "Neg.") %>% sample_data %>%
                                        data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz,
                                sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)


bray_perm_ns_ctrl <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz,
                                sample_type == "Nasal" | sample_type == "Mock" | sample_type == "Neg."), method="bray") ~ sample_type + lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz,
                                sample_type == "Nasal" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz,
                                sample_type == "Nasal" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_bal_ctrl <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz,
                                sample_type == "BAL" | sample_type == "Mock" | sample_type == "Neg."), method="bray") ~ sample_type + lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz,
                                sample_type == "BAL" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz,
                                sample_type == "BAL" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

bray_perm_spt_ctrl <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz,
                                sample_type == "Sputum" | sample_type == "Mock" | sample_type == "Neg."), method="bray") ~ sample_type + lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz,
                                sample_type == "Sputum" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F),
                                strata = subset_samples(phyloseq_rel_nz,
                                sample_type == "Sputum" | sample_type == "Mock" | sample_type == "Neg.") %>%
                                        sample_data %>% data.frame(check.names = F) %>% .$subject_id,
                                  permutations = 10000)

PCoA based on Bray-Curtis (all samples)

Based on distances, it seems like some of the negative controls were affected by some samples. Meanwhile, Positive controls (mock community) were close to BAL samples.

# PCoA of Bray-Curtis dissimilarities for all retained samples, coloured by
# treatment and shaped by sample type.
# NOTE(review): no `breaks` are given, so colours and labels are applied
# positionally and assume the level order of `treatment` / `sample_type`
# matches the label order below — confirm.
ordinate(phyloseq_rel_nz, method = "PCoA", distance = "bray") %>%
        plot_ordination(phyloseq_rel_nz, ., col = "treatment", shape = "sample_type") +
        scale_color_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           name = "Treatment",
                           labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_shape(name = "Sample type", labels = c("BAL", "Nasal", "Sputum", "Mock", "Neg.")) +
        geom_point(size = 3) +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, "cm"), legend.key.height = unit(0.4, "cm")) + #legend.position = c(0.9, 0.4)
        labs(tag = "E")

PCoA based on Jaccard (all samples)

Jaccard dissimilarities (presence and absence) showed that BAL and Mock communities are distant. Some samples may overlap.

# PCoA of Jaccard dissimilarities for all retained samples, coloured by
# treatment and shaped by sample type.
# `binary = TRUE` is forwarded to vegan::vegdist() so that the index is the
# presence/absence Jaccard described in the text; without it the
# abundance-weighted (quantitative) form is computed.
# NOTE(review): no `breaks` are given, so colours and labels are applied
# positionally and assume the level order of `treatment` / `sample_type`
# matches the label order below — confirm.
phyloseq_rel_nz %>%
        distance(method = "jaccard", binary = TRUE) %>%
        ordinate(phyloseq_rel_nz, method = "PCoA", distance = .) %>%
        plot_ordination(phyloseq_rel_nz, ., col = "treatment", shape = "sample_type") +
        scale_color_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           name = "Treatment",
                           labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_shape(name = "Sample type", labels = c("BAL", "Nasal", "Sputum", "Mock", "Neg.")) +
        geom_point(size = 3) +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, "cm"), legend.key.height = unit(0.4, "cm")) + #legend.position = c(0.9, 0.4)
        labs(tag = "E")

Stratified Bray beta diversity (Mock)

Some treatment made samples distant to the theoretical composition

# PCoA of Bray-Curtis dissimilarities for the control object without the
# negative controls, coloured by treatment ("-" is labelled as the
# theoretical mock composition).
# The same subset is used for the ordination and for the plot so that the
# coordinates and the sample metadata always refer to the same samples.
subset_samples(phyloseq_control_rel, sample_type != "Neg.") %>%
        {
                plot_ordination(., ordinate(., method = "PCoA", distance = "bray"), col = "treatment")
        } +
        scale_color_manual(values = c("black", "#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           name = "Treatment",
                           breaks = c("-", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
                           labels = c("Mock theoretical", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        geom_point(size = 3) +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, "cm"), legend.key.height = unit(0.4, "cm"))  #legend.position = c(0.9, 0.4)

        #labs(tag = "E")

Stratified Jaccard beta diversity (Mock)

Some treatment made samples distant to the theoretical composition

# PCoA of Jaccard dissimilarities for the control object without the
# negative controls, coloured by treatment ("-" is labelled as the
# theoretical mock composition).
# The same subset is used for the distances, the ordination and the plot.
# `binary = TRUE` requests the presence/absence form of the Jaccard index;
# without it vegan computes the abundance-weighted form.
subset_samples(phyloseq_control_rel, sample_type != "Neg.") %>%
        {
                plot_ordination(.,
                                ordinate(., method = "PCoA",
                                         distance = distance(., method = "jaccard", binary = TRUE)),
                                col = "treatment")
        } +
        scale_color_manual(values = c("black", "#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           name = "Treatment",
                           breaks = c("-", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
                           labels = c("Mock theoretical", "Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        geom_point(size = 3) +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, "cm"), legend.key.height = unit(0.4, "cm"))  #legend.position = c(0.9, 0.4)

        #labs(tag = "E")

Bar plot of mock community

Some bugs greatly decreased after some treatment

Differential abundance analysis should be conducted.

# Bar plot of the non-negative control samples, filled by the 20 taxa with the
# largest summed abundance; every other taxon is pooled as "[Others]".
phyloseq_control_rel %>%
        subset_samples(sample_type != "Neg.") %>%
        {
                phyloseq_temp <- .
                # Taxa ordered by decreasing total abundance (ties keep their original order).
                taxa_totals <- taxa_sums(phyloseq_temp)
                top20species <- head(names(taxa_totals)[order(-taxa_totals)], 20)
                # Extra taxonomy column used only for the fill legend.
                tax_top20 <- cbind(tax_table(phyloseq_temp), species20 = "[Others]")
                tax_top20[top20species, "species20"] <- as.character(tax_top20[top20species, "Species"])
                # Strip the "s__" prefix, replace underscores and wrap in <i> so that
                # element_markdown() renders the legend entries in italics.
                # The column is addressed by name rather than by position.
                tax_top20[, "species20"] <- paste0("<i>",
                                                   gsub("_", " ", gsub("s__", "", tax_top20[, "species20"])),
                                                   "</i>")
                tax_table(phyloseq_temp) <- tax_table(tax_top20)
                phyloseq_temp
        } %>%
        plot_bar(., fill = "species20") +
        ylab("Relative abundance") +
        theme_classic(base_size = 11, base_family = "serif") +
        ggtitle("Bar plot of positive controls") +
        theme(legend.text = element_markdown()) +
        guides(fill = guide_legend(title = "Top 20 species")) +
        facet_wrap(~ treatment, scales = "free_x", nrow = 1)

#there could be opportunistic pathogens...

Stratified Bray beta diversity (negative)

However, it seems there is no differences between negative controls..

# PCoA of Bray-Curtis dissimilarities for the negative controls only,
# coloured by treatment.
# The same subset is used for the ordination and for the plot so that the
# coordinates and the sample metadata always refer to the same samples.
subset_samples(phyloseq_control_rel, sample_type == "Neg.") %>%
        {
                plot_ordination(., ordinate(., method = "PCoA", distance = "bray"), col = "treatment")
        } +
        scale_color_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           name = "Treatment",
                           breaks = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
                           labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        geom_point(size = 3) +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, "cm"), legend.key.height = unit(0.4, "cm"))  #legend.position = c(0.9, 0.4)

        #labs(tag = "E")

Beta diversity boxplot

Distances between samples within each subject. Mean distance between control <-> treatment for each subject

# ---- Beta-diversity boxplot: distance of each treated sample to its control ----

# Pairwise Bray-Curtis distances in long format; the columns iso1, iso2 and
# dist are the ones used below.
bray_dist_long <- distance(phyloseq_rel_nz, method = "bray") %>%
        as.matrix() %>%
        melt_dist()

# Depletion method encoded in a sample name; names matching none of the
# patterns are treated as untreated controls. Patterns are tested in this order.
label_depletion_method <- function(sample_name) {
        case_when(grepl("lyPMA", sample_name) ~ "lypma",
                  grepl("ben", sample_name) ~ "benzonase",
                  grepl("host", sample_name) ~ "host_zero",
                  grepl("qia", sample_name) ~ "qiaamp",
                  grepl("moly", sample_name) ~ "molysis",
                  .default = "control")
}

# Pool all positive / negative controls under one id each so that they are
# compared within their control group.
collapse_control_ids <- function(sample_id) {
        case_when(grepl("pos", sample_id, ignore.case = TRUE) ~ "Mock",
                  grepl("neg|n_", sample_id, ignore.case = TRUE) ~ "Neg.",
                  .default = sample_id)
}

# Sample ids are the first two "_"-separated fields of the sample names.
# (This could also be done by merging the metadata into `bray_dist_long`.)
# `names` / `names2` are kept as top-level objects, as in the original script.
names <- data.frame(str_split_fixed(bray_dist_long$iso1, "_", 3))
names2 <- data.frame(str_split_fixed(bray_dist_long$iso2, "_", 3))

bray_dist_long$sample_id_1 <- paste(names$X1, names$X2, sep = "_")
bray_dist_long$method_1 <- label_depletion_method(bray_dist_long$iso1)
bray_dist_long$sample_id_2 <- paste(names2$X1, names2$X2, sep = "_")
bray_dist_long$method_2 <- label_depletion_method(bray_dist_long$iso2)

bray_dist_long$sample_id_1 <- collapse_control_ids(bray_dist_long$sample_id_1)
bray_dist_long$sample_id_2 <- collapse_control_ids(bray_dist_long$sample_id_2)

# Keep pairs from the same sample id where exactly one member is the control;
# `treatment` is the method of the non-control member.
path_bray_dist_long_within_sampleid_from_control <- bray_dist_long %>%
        filter(sample_id_1 == sample_id_2,
               method_1 != method_2,
               method_1 == "control" | method_2 == "control") %>%
        mutate(treatment = if_else(method_1 == "control", method_2, method_1),
               sample_type = case_when(grepl("NS", iso1) ~ "Nasal",
                                       grepl("CFB", iso1) ~ "Sputum",
                                       grepl("BAL", iso1) ~ "BAL",
                                       grepl("pos|POS", iso1) ~ "Mock",
                                       grepl("neg|N_EXT", iso1) ~ "Neg.",
                                       .default = NA_character_))

# `treatment` is a character column, so ggplot would order it alphabetically
# while positional labels follow a different order. Fix the order explicitly
# and name the colours so that every method gets its own label and colour.
path_bray_dist_long_within_sampleid_from_control %>%
        mutate(sample_type = factor(sample_type, levels = c("Neg.", "Mock", "BAL", "Nasal", "Sputum")),
               treatment = factor(treatment, levels = c("lypma", "benzonase", "host_zero", "molysis", "qiaamp"))) %>%
        ggplot(aes(y = dist, fill = treatment)) +
        geom_boxplot() +
        scale_fill_manual(values = c(lypma = "#fb9a99", benzonase = "#33a02c", host_zero = "#b2df8a",
                                     molysis = "#1f78b4", qiaamp = "#a6cee3"),
                          breaks = c("lypma", "benzonase", "host_zero", "molysis", "qiaamp"),
                          labels = c("lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp"),
                          name = "Treatment") + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Sample pair distances") +
        theme_classic(base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type, ncol = 5)

PERMANOVA test results

Subject as fixed effect vs strata term

Subject as fixed effect

adonis (dist ~ sample_type + log10(Final_reads) + treated + subject)

With strata

adonis (dist ~ sample_type + log10(Final_reads) + treated, strata = subject)

Strata term was employed rather than fixed effect

# HTML table of the PERMANOVA with subject as a fixed effect.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_uni %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("sample_type" = "Sample type",
                                    "treatment" = "Treatment",
                                    "subject_id" = "Subject",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html", caption = "Subject id as fixed effect") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Subject id as fixed effect
Df SumOfSqs R2 F Pr(>F)
Sample type 4 28.047 0.462 63.093 0
log10(Final reads) 1 1.268 0.021 11.410 0
Treatment 5 1.885 0.031 3.393 0
Subject 17 15.389 0.254 8.146 0
Residual 127 14.114 0.233 NA NA
Total 154 60.703 1.000 NA NA
# HTML table of the PERMANOVA with subject as permutation strata.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_uni_strata %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("sample_type" = "Sample type",
                                    "treatment" = "Treatment",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html", caption = "Subject id as strata term") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Subject id as strata term
Df SumOfSqs R2 F Pr(>F)
Sample type 4 28.047 0.462 34.223 0
log10(Final reads) 1 1.268 0.021 6.189 0
Treatment 5 1.885 0.031 1.840 0
Residual 144 29.503 0.486 NA NA
Total 154 60.703 1.000 NA NA

Treatment significantly affected the beta-diversity.

Strata term is better representing our study aim.

What type of method affected the community at the most?

PERMANOVA on each treatment

dist ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp, strata = subject

QIAamp showed highest changes. But, it could be sample type specific.

# HTML table of the per-method PERMANOVA on all samples.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_strata %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("sample_type" = "Sample type",
                                    "lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 4 28.047 0.462 34.223 0.000
log10(Final reads) 1 1.268 0.021 6.189 0.000
lyPMA 1 0.166 0.003 0.810 0.317
Benzonase 1 0.160 0.003 0.783 0.263
Host zero 1 0.299 0.005 1.457 0.026
Molysis 1 0.343 0.006 1.675 0.005
QIAamp 1 0.917 0.015 4.477 0.000
Residual 144 29.503 0.486 NA NA
Total 154 60.703 1.000 NA NA

PERMANOVA with interaction term

dist ~ sample_type * treatment + log10(Final_reads), strata = subject

It was sample type specific. We need stratified analysis

# HTML table of the PERMANOVA with the sample type x treatment interaction.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_inter %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("sample_type" = "Sample type",
                                    "treatment" = "Treatment",
                                    "subject_id" = "Subject",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "sample_type:treatment" = "Sample type * treatment",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 4 28.047 0.462 38.172 0
Treatment 5 1.990 0.033 2.166 0
log10(Final reads) 1 1.164 0.019 6.334 0
Sample type * treatment 20 6.726 0.111 1.831 0
Residual 124 22.777 0.375 NA NA
Total 154 60.703 1.000 NA NA

Stratified (Positive)

Composition changed substantially after treatment relative to the untreated samples. Differential abundance analysis should be conducted.

# HTML table of the per-method PERMANOVA restricted to Mock samples.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_pos %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "subject_id" = "Subject id",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.518 0.155 58.411 0.000
Benzonase 1 0.179 0.054 20.208 0.000
Host zero 1 0.318 0.095 35.839 0.000
Molysis 1 0.640 0.192 72.234 0.000
QIAamp 1 1.457 0.437 164.410 0.000
log10(Final reads) 1 0.013 0.004 1.451 0.234
Residual 24 0.213 0.064 NA NA
Total 30 3.338 1.000 NA NA

Stratified (Neg)

Composition changed after treatment relative to the untreated samples.

# HTML table of the per-method PERMANOVA restricted to negative controls.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_neg %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "subject_id" = "Subject id",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.162 0.025 0.974 0.417
Benzonase 1 0.256 0.039 1.540 0.147
Host zero 1 1.089 0.167 6.556 0.000
Molysis 1 0.132 0.020 0.793 0.584
QIAamp 1 0.196 0.030 1.183 0.306
log10(Final reads) 1 0.700 0.107 4.213 0.000
Residual 24 3.987 0.611 NA NA
Total 30 6.522 1.000 NA NA

Stratified (Nasal)

lyPMA affected Nasal samples.

# HTML table of the per-method PERMANOVA restricted to Nasal samples.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_ns %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "subject_id" = "Subject id",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.730 0.120 4.873 0.000
Benzonase 1 0.191 0.031 1.277 0.101
Host zero 1 0.171 0.028 1.143 0.421
Molysis 1 0.137 0.022 0.914 0.066
QIAamp 1 0.254 0.042 1.694 0.050
log10(Final reads) 1 0.428 0.070 2.861 0.028
Residual 28 4.192 0.687 NA NA
Total 34 6.103 1.000 NA NA

Stratified (BAL)

QIAamp affected BAL samples

# HTML table of the per-method PERMANOVA restricted to BAL samples.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_bal %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "subject_id" = "Subject id",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.100 0.010 0.272 0.337
Benzonase 1 0.025 0.003 0.068 0.987
Host zero 1 0.086 0.009 0.235 0.517
Molysis 1 0.085 0.009 0.230 0.573
QIAamp 1 0.229 0.024 0.623 0.005
log10(Final reads) 1 1.482 0.152 4.028 0.022
Residual 21 7.726 0.794 NA NA
Total 27 9.734 1.000 NA NA

Stratified (spt)

Sputum was affected by Molysis and QIAamp.

# HTML table of the per-method PERMANOVA restricted to Sputum samples.
# The significance flag is derived from the unrounded p-value; rounding first
# can hide a p-value just below 0.05 that rounds to 0.050.
bray_perm_spt %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c("lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp",
                                    "subject_id" = "Subject id",
                                    "log10(Final_reads)" = "log10(Final reads)",
                                    "Residual" = "Residual",
                                    "Total" = "Total")[row.names]),
               ` ` = case_when(`Pr(>F)` < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames("row.names") %>%
        mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.139 0.020 0.633 0.196
Benzonase 1 0.037 0.005 0.170 0.768
Host zero 1 0.171 0.025 0.777 0.130
Molysis 1 0.436 0.063 1.985 0.010
QIAamp 1 0.953 0.137 4.339 0.000
log10(Final reads) 1 0.172 0.025 0.783 0.354
Residual 23 5.052 0.726 NA NA
Total 29 6.960 1.000 NA NA

Results:

4.1. The effect of each treatment on beta-diversity was sample type specific.

4.2. NS showed no significant change by QIAamp method

4.3. Sputum showed high change after Molysis and QIAamp. However, here, high change may be meaning higher (better) detection efficiencies. Therefore further analysis is required.

Intermediate results

# Summary table of the issues observed for each treatment method
# (rows: sample types, columns: methods; NA = no issue recorded).
data.frame(
        lyPMA = rep("No increase in final reads", 3),
        Benzonase = rep("No decrease in host %", 3),
        `Host zero` = rep(NA, 3),
        Molysis = c("No decrease in host %",
                    "High chance of failure in library prep",
                    NA),
        QIAamp = c("No decrease in host %",
                   NA,
                   "No decrease in host %"),
        row.names = c("BAL", "Nasal", "Sputum"),
        check.names = FALSE # keep the space in the `Host zero` column name
) %>%
        kbl(format = "html", caption = "Table of issues of each treatment method") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Table of issues of each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL No increase in final reads No decrease in host % NA No decrease in host % No decrease in host %
Nasal No increase in final reads No decrease in host % NA High cahnge of failure in library pep NA
Sputum No increase in final reads No decrease in host % NA NA No decrease in host %
# Summary table of the community changes observed for each treatment method.
# Rows are sample types, columns are treatments; NA cells are left empty.
data.frame(
  lyPMA = c(NA, "Beta changed", "Shannon +"),
  Benzonase = c(NA, NA, "Richness + InvSimp +"),
  `Host zero` = c(NA, "Richness + InvSimp + ", NA),
  Molysis = c(NA, "Richness + InvSimp +", "Beta changed"),
  QIAamp = c("Beta changed", NA, "Beta  changed"),
  row.names = c("BAL", "Nasal", "Sputum"),
  check.names = FALSE,       # keep the space in "Host zero"
  stringsAsFactors = FALSE
) %>%
  kbl(format = "html", caption = "Table of community changes induced by each treatment method") %>%
  kable_styling(full_width = 0, html_font = "serif")
Table of community changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA Beta changed
Nasal Beta changed NA Richness + InvSimp + Richness + InvSimp + NA
Sputum Shannon + Richness + InvSimp + NA Beta changed Beta changed

Some methods were successful in increasing final reads and lowering host DNA%.

We cannot yet tell whether some of the changes in diversity are due to deeper sequencing or to contaminants.

Further analyses of individual taxa are required.

A5. DA analysis for taxa, by sample type and treatment

Hypothesis: if a taxon is a contaminant induced by a treatment method, its DA analysis result should be associated with treatment covariate.

Both stratified and nonstratified were conducted.

Looked at other level groups as well - family and genus

Without interaction

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

With interaction

feature ~ log10(final reads) + sample type + treatment + sample type * treatment + (1|subject)

Stratified

feature ~ log10(final reads) + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

MaAsLin settings : log transform, total sum scaling normalization

Results

# DA analysis - MaAsLin
# Add log10-transformed final read counts to the sample metadata.
sample_data(phyloseq_rel_nz)$log10.Final_reads <- log10(sample_data(phyloseq_rel_nz)$Final_reads)

# Running MaAsLin for all samples without decontam.
# For taxa differentially abundant by host depletion method, look to see which
# ones overlap with potential contaminant taxa.

# Maaslin - # # y ~ log(final reads) + sample_type + treatment  -----------

# All samples: load the saved result tables, one per sample type
# (sputum, BAL, nasal).
fit_data_spt_neg <- read.csv("data/fit_data_spt_neg.csv")
fit_data_bal_neg <- read.csv("data/fit_data_bal_neg.csv")
# Fixed copy-paste error: this previously re-read the sputum file, so the
# "nasal" table was a duplicate of the sputum one.
# NOTE(review): file name inferred from the naming pattern above -- confirm
# that data/fit_data_ns_neg.csv exists.
fit_data_ns_neg <- read.csv("data/fit_data_ns_neg.csv")

MaAslin - volcano plot

Without interaction

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

Most taxa are differentially abundant by sample type

#Making significance table for figure
        # Define a function to make species names italicized
species_italic <- function(data) {
  # Rewrites the row names of `data` as markdown so that taxon names render in
  # italics (e.g. through ggtext::element_markdown()).
  #
  # data: any object with row names (data frame or matrix).
  # Returns `data` with only its row names changed:
  #   "Rothia_mucilaginosa"       -> "*Rothia mucilaginosa*"
  #   "Actinomyces_sp_oral_taxon" -> "*Actinomyces* sp. oral taxon"
  #   "Streptococcus_mitis_group" -> "*Streptococcus mitis* group."
  names <- gsub("_", " ", rownames(data))
  # Drop square brackets such as "[Collinsella]".
  names <- gsub("[]]|[[]", "", names)
  # Normalise the "sp" / "group" markers only when they are whole words, so
  # epithets that merely start with "sp" (e.g. "sputorum") are left alone and
  # an existing "sp." does not receive a second period.
  names <- gsub(" sp\\.?(?= |$)", " sp.", names, perl = TRUE)
  names <- gsub(" group(?= |$)", " group.", names, perl = TRUE)
  # Close the italic span once, just before the first marker.
  names <- sub(" (sp\\.|group\\.)(?= |$)", "* \\1", names, perl = TRUE)
  # Names with a closing "*" only need the opening one; all others are wrapped.
  names <- ifelse(grepl("[*]", names), paste("*", names, sep = ""), paste("*", names, "*", sep = ""))
  rownames(data) <- names
  data
}

# Make a significance table for each figure (top `n_top` taxa, 20 by default).
#
# data:  long table with (at least) the columns `feature`, `metadata` and
#        `qval` -- presumably MaAsLin results; confirm against the caller.
# n_top: number of features to keep, smallest q-value first. If fewer features
#        are available, all of them are returned (no NA padding).
#
# Returns a list with three elements:
#   data        - wide table: `feature` plus one q-value column per treatment
#                 (lypma, benzonase, host_zero, molysis, qiaamp)
#   data_italic - the same q-values with markdown-italic taxon names as row
#                 names (see species_italic()) and display column labels
#   data_sig    - same shape as data_italic; "*" where q < 0.1, NA otherwise
make_sig_table <- function(data, n_top = 20) {
  sig_data <- spread(data[order(data$qval), c("feature", "metadata", "qval")], metadata, qval)
  # Smallest q-value per feature across all covariates. The `feature` column
  # is excluded so the comparison is numeric: apply(sig_data, 1, min) coerced
  # the whole row to character and compared strings. As before, a feature with
  # a missing q-value gets NA and is sorted last.
  qval_cols <- setdiff(names(sig_data), "feature")
  sig_data$min <- do.call(pmin, unname(as.list(sig_data[qval_cols])))
  sig_data <- sig_data[order(sig_data$min), ] %>%
    select("feature", "lypma", "benzonase", "host_zero", "molysis", "qiaamp") %>%
    head(n_top)
  # Restore the one feature name whose brackets were replaced by "X." / "."
  sig_data[["feature"]] <- ifelse(sig_data[["feature"]] == "X.Collinsella._massiliensis", "[Collinsella]_massiliensis", sig_data[["feature"]])
  # The old row names are parked in a throw-away column ("-") so that the
  # feature names can take their place, then dropped again.
  sig_data_italic <- sig_data %>%
    rownames_to_column(var = "-") %>%
    column_to_rownames(var = "feature") %>%
    species_italic %>%
    select(-c("-")) %>%
    rename(lyPMA = lypma, Benzonase = benzonase, `Host zero` = host_zero, Molysis = molysis, QIAamp = qiaamp)
  sig_data_sig <- ifelse(sig_data_italic < 0.1, "*", NA) %>% data.frame(check.names = F)
  return(list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig))
}

# Variant used for the negative controls: identical to make_sig_table() but
# keeps the top 16 features. It now works on its `data` argument instead of
# silently reading the global `fit_data_neg`.
make_sig_table_neg <- function(data) {
  make_sig_table(data, n_top = 16)
}

# Replace each long result table with the list returned by make_sig_table()
# (elements: data, data_italic, data_sig). Each object is overwritten in place,
# so this chunk must not be run twice on the same objects.
# The negative-control table goes through make_sig_table_neg(), which keeps
# 16 rows instead of 20.
# NOTE(review): fit_data_neg, fit_data_pos, fit_data_bal, fit_data_ns and
# fit_data_spt are not created in the visible part of the file -- presumably
# defined earlier; confirm.
fit_data_neg <- make_sig_table_neg(fit_data_neg)
fit_data_pos <- make_sig_table(fit_data_pos)
fit_data_bal <- make_sig_table(fit_data_bal)
fit_data_ns <- make_sig_table(fit_data_ns)
fit_data_spt <- make_sig_table(fit_data_spt)
fit_data_bal_neg <- make_sig_table(fit_data_bal_neg)
fit_data_ns_neg <- make_sig_table(fit_data_ns_neg)
fit_data_spt_neg <- make_sig_table(fit_data_spt_neg)


# Helper: mean relative abundance of the selected taxa per treatment group.
#
# physeq_subset: phyloseq object already restricted to one sample type and to
#                the taxa listed in sig_table$data$feature.
# sig_table:     list returned by make_sig_table(); the row names of its
#                data_italic element decide which rows are kept and their order.
#
# Returns a data frame with a `feature` column (markdown-italic taxon name)
# and one column per treatment; zero means are turned into NA.
mean_rel_by_treatment <- function(physeq_subset, sig_table) {
  cbind(physeq_subset %>% otu_table %>% t, physeq_subset %>% sample_data) %>%
    group_by(treatment) %>%
    summarise_if(is.numeric, mean, na.rm = TRUE) %>%
    # Keep the grouping column plus the first 20 numeric columns (unchanged
    # from the original code). Rows that are not taxa of interest are dropped
    # below by the data_italic row-name lookup.
    .[, 1:21] %>%
    column_to_rownames(., "treatment") %>%
    t() %>%
    species_italic() %>%
    data.frame(check.names = F) %>%
    .[row.names(sig_table$data_italic), ] %>%
    mutate_all(~na_if(., 0)) %>%
    rownames_to_column("feature")
}

# For every sample type: keep the samples of that type, keep the taxa present
# in the matching significance table, then store the per-treatment means as
# the `rel` element of that table.

# Negative controls
neg_samples <- subset_samples(phyloseq_rel_nz, sample_type == "Neg.")
neg_sig <- subset_taxa(neg_samples, taxa_names(neg_samples) %in% fit_data_neg$data$feature)
fit_data_neg$rel <- mean_rel_by_treatment(neg_sig, fit_data_neg)

# Mock community (positive controls)
mock_samples <- subset_samples(phyloseq_rel_nz, sample_type == "Mock")
pos_sig <- subset_taxa(mock_samples, taxa_names(mock_samples) %in% fit_data_pos$data$feature)
fit_data_pos$rel <- mean_rel_by_treatment(pos_sig, fit_data_pos)

# Sputum
spt_samples <- subset_samples(phyloseq_rel_nz, sample_type == "Sputum")
spt_sig <- subset_taxa(spt_samples, taxa_names(spt_samples) %in% fit_data_spt$data$feature)
fit_data_spt$rel <- mean_rel_by_treatment(spt_sig, fit_data_spt)

spt_neg_sig <- subset_taxa(spt_samples, taxa_names(spt_samples) %in% fit_data_spt_neg$data$feature)
fit_data_spt_neg$rel <- mean_rel_by_treatment(spt_neg_sig, fit_data_spt_neg)

# Nasal
ns_samples <- subset_samples(phyloseq_rel_nz, sample_type == "Nasal")
ns_sig <- subset_taxa(ns_samples, taxa_names(ns_samples) %in% fit_data_ns$data$feature)
fit_data_ns$rel <- mean_rel_by_treatment(ns_sig, fit_data_ns)

# Fixed copy-paste error: the nasal taxa were previously selected with the
# sputum table (fit_data_spt_neg) instead of the nasal one.
ns_neg_sig <- subset_taxa(ns_samples, taxa_names(ns_samples) %in% fit_data_ns_neg$data$feature)
fit_data_ns_neg$rel <- mean_rel_by_treatment(ns_neg_sig, fit_data_ns_neg)

# BAL
bal_samples <- subset_samples(phyloseq_rel_nz, sample_type == "BAL")
bal_sig <- subset_taxa(bal_samples, taxa_names(bal_samples) %in% fit_data_bal$data$feature)
fit_data_bal$rel <- mean_rel_by_treatment(bal_sig, fit_data_bal)

bal_neg_sig <- subset_taxa(bal_samples, taxa_names(bal_samples) %in% fit_data_bal_neg$data$feature)
fit_data_bal_neg$rel <- mean_rel_by_treatment(bal_neg_sig, fit_data_bal_neg)

# Volcano plot: one point per row of `maaslin_all`, with the MaAsLin
# coefficient on x, -log10(q-value) on y, and colour by covariate (`metadata`).
# NOTE(review): `maaslin_all` is not created in the visible part of the file.

ggplot(maaslin_all, aes(y = -log10(qval), x = coef, col = metadata)) +
        theme_classic(base_family = "serif") +
        labs(tag = "A") +
        geom_point(size = 2) +
        xlab("MaAslin coefficient") +
        # Markdown/HTML axis title; rendered by element_markdown() in theme() below
        ylab("-log<sub>10</sub>(*q*-value)") +
        # Reference lines: y = 1 is q = 0.1 (-log10(0.1) = 1); x = 0 is a zero coefficient
        geom_hline(yintercept = 1, col = "gray") +
        geom_vline(xintercept = 0, col = "gray") +
        # Text label explaining the reference lines, placed at x = 4, y = 8
        geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
        theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
        # One colour per covariate; `breaks` are the values of `metadata`, `labels` their display names
        scale_color_manual(values = c("#8c510a",  "#c51b7d", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                           breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero",  "molysis", "qiaamp"), 
                           labels = c("log10 (Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero",  "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        guides(col = guide_legend(title = "Covariates", title.position = "top", nrow = 2))

MaAslin - table

feature ~ log10(final reads) + sample type + lyPMA + Benzonase + Host zero + Molysis + QIAamp + (1|subject)

Some taxa were changed due to treatment

Stratified analysis is required.

Number of associations with a significant (q < 0.1) result, by covariate

# Count the significant (q < 0.1) rows of `maaslin_all` per covariate
# (`metadata`). The lines starting with "##" are the printed results from the
# rendered report.
cat("all association")
## all association
maaslin_all %>% subset(., .$qval < 0.1) %>% arrange(., .$feature) %>% .$metadata %>% table
## .
##         benzonase         host_zero log10.Final_reads             lypma 
##                27                15               100                25 
##           molysis            qiaamp       sample_type 
##                31                15               122
# Same count, restricted to positive coefficients (coef > 0).
cat("Positive association (increased taxa)")
## Positive association (increased taxa)
maaslin_all %>% subset(., .$qval < 0.1) %>% subset(., .$coef > 0) %>% arrange(., .$feature) %>% .$metadata %>% table
## .
##         benzonase         host_zero log10.Final_reads             lypma 
##                19                 7                99                24 
##           molysis            qiaamp       sample_type 
##                23                 7               111

MaAsLin cannot test the global significance of a multi-level covariate.

(https://forum.biobakery.org/t/global-significance-test-for-multilevel-factor/3061)

Balloon plot - Neg.

No taxa changed after treatment

# Balloon plot for the negative controls (fit_data_neg).
# Merging data into one data frame: the three tables are reshaped to long
# format (feature, treatment, <value>) and joined on feature + treatment with
# all = T, so combinations present in only one table are kept.
#   $rel         -> value (mean relative abundance, balloon size)
#   $data_italic -> qval  (balloon fill)
#   $data_sig    -> sig   ("*" label where q < 0.1)
merge(fit_data_neg$rel %>%
              gather(treatment,
                     value,
                     Untreated:QIAamp,
                     factor_key=TRUE),
      fit_data_neg$data_italic %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     qval,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
      by.x = c('feature', 'treatment'),
      by.y = c('feature', 'treatment'),
      all = T) %>%
        
        merge(fit_data_neg$data_sig %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     sig,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
              by.x = c('feature', 'treatment'),
              by.y = c('feature', 'treatment'),
              all = T) %>%
# Balloon plot
        ggballoonplot(size = "value", fill = "qval", y = "feature", x= "treatment") +
        
        theme_classic(base_family = "serif") +
        # Colors for q-values
        gradient_fill(c("#006d2c", "#edf8fb")) +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              # Element markdown so the "*...*" taxon names render in italics
              axis.text.y = ggtext::element_markdown())  +
        # Adding significance asterisks; the constant col = "red" is placed
        # inside aes(), presumably so that a colour legend entry is created
        geom_text(aes(y = feature,
                      x = treatment,
                      label = sig,
                      col = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        # Legend order: size (1), fill (2), significance colour (3)
        guides(col = guide_legend(nrow = 1,
                                  override.aes = aes(label = "*", size = 10, color = "red"),
                                  title="Significance",
                                  title.position = "top", order = 3, ),
               fill = guide_colorbar(title = c(expression(paste(italic("q"),
                                                       "-value",
                                                       sep = ""))),
                                   title.position = "top",
                                   order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   order = 1,
                                   nrow = 2),
               ) + 
        # NOTE(review): these label keys are lower-case codes, while the
        # treatment values come from the gathered column names
        # (Untreated:QIAamp) -- confirm the mapping is actually applied
        scale_x_discrete(labels=c("control" = "Untreated",
                                  "lypma" = "lyPMA",
                                  "benzonase" = "Benzonase",
                                  "host_zero" = "Host-zero",
                                  "molysis" = "Molysis",
                                  "qiaamp" = "QIAamp")
                         ) +
        # Colour and legend label for the significance asterisk
        scale_color_manual(values = c("red"),
                           labels = c(expression(paste(italic("q"),
                                                       "-value < 0.1",
                                                       sep = "")
                                                 )
                                      )
                           )

Balloon plot - Pos.

Taxa decreased after treatment

# Balloon plot for the mock-community (positive) controls (fit_data_pos).
# Merging data into one data frame: the three tables are reshaped to long
# format (feature, treatment, <value>) and joined on feature + treatment with
# all = T, so combinations present in only one table are kept.
#   $rel         -> value (mean relative abundance, balloon size)
#   $data_italic -> qval  (balloon fill)
#   $data_sig    -> sig   ("*" label where q < 0.1)
merge(fit_data_pos$rel %>%
              gather(treatment,
                     value,
                     Untreated:QIAamp,
                     factor_key=TRUE),
      fit_data_pos$data_italic %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     qval,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
      by.x = c('feature', 'treatment'),
      by.y = c('feature', 'treatment'),
      all = T) %>%
        
        merge(fit_data_pos$data_sig %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     sig,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
              by.x = c('feature', 'treatment'),
              by.y = c('feature', 'treatment'),
              all = T) %>%
# Balloon plot
        ggballoonplot(size = "value", fill = "qval", y = "feature", x= "treatment") +
        
        theme_classic(base_family = "serif") +
        # Colors for q-values
        gradient_fill(c("#006d2c", "#edf8fb")) +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              # Element markdown so the "*...*" taxon names render in italics
              axis.text.y = ggtext::element_markdown())  +
        # Adding significance asterisks; the constant col = "red" is placed
        # inside aes(), presumably so that a colour legend entry is created
        geom_text(aes(y = feature,
                      x = treatment,
                      label = sig,
                      col = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        # Legend order: size (1), fill (2), significance colour (3)
        guides(col = guide_legend(nrow = 1,
                                  override.aes = aes(label = "*", size = 10, color = "red"),
                                  title="Significance",
                                  title.position = "top", order = 3, ),
               fill = guide_colorbar(title = c(expression(paste(italic("q"),
                                                       "-value",
                                                       sep = ""))),
                                   title.position = "top",
                                   order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   order = 1,
                                   nrow = 2),
               ) + 
        # NOTE(review): these label keys are lower-case codes, while the
        # treatment values come from the gathered column names
        # (Untreated:QIAamp) -- confirm the mapping is actually applied
        scale_x_discrete(labels=c("control" = "Untreated",
                                  "lypma" = "lyPMA",
                                  "benzonase" = "Benzonase",
                                  "host_zero" = "Host-zero",
                                  "molysis" = "Molysis",
                                  "qiaamp" = "QIAamp")
                         ) +
        # Colour and legend label for the significance asterisk
        scale_color_manual(values = c("red"),
                           labels = c(expression(paste(italic("q"),
                                                       "-value < 0.1",
                                                       sep = "")
                                                 )
                                      )
                           )

Balloon plot - BAL

Mean relative abundances of the 20 taxa with the lowest q-values.

No taxa changed after treatment

# Balloon plot for BAL samples (fit_data_bal).
# Merging data into one data frame: the three tables are reshaped to long
# format (feature, treatment, <value>) and joined on feature + treatment with
# all = T, so combinations present in only one table are kept.
#   $rel         -> value (mean relative abundance, balloon size)
#   $data_italic -> qval  (balloon fill)
#   $data_sig    -> sig   ("*" label where q < 0.1)
merge(fit_data_bal$rel %>%
              gather(treatment,
                     value,
                     Untreated:QIAamp,
                     factor_key=TRUE),
      fit_data_bal$data_italic %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     qval,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
      by.x = c('feature', 'treatment'),
      by.y = c('feature', 'treatment'),
      all = T) %>%
        
        merge(fit_data_bal$data_sig %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     sig,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
              by.x = c('feature', 'treatment'),
              by.y = c('feature', 'treatment'),
              all = T) %>%
# Balloon plot
        ggballoonplot(size = "value", fill = "qval", y = "feature", x= "treatment") +
        
        theme_classic(base_family = "serif") +
        # Colors for q-values
        gradient_fill(c("#006d2c", "#edf8fb")) +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              # Element markdown so the "*...*" taxon names render in italics
              axis.text.y = ggtext::element_markdown())  +
        # Adding significance asterisks; the constant col = "red" is placed
        # inside aes(), presumably so that a colour legend entry is created
        geom_text(aes(y = feature,
                      x = treatment,
                      label = sig,
                      col = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        # Legend order: size (1), fill (2), significance colour (3)
        guides(col = guide_legend(nrow = 1,
                                  override.aes = aes(label = "*", size = 10, color = "red"),
                                  title="Significance",
                                  title.position = "top", order = 3, ),
               fill = guide_colorbar(title = c(expression(paste(italic("q"),
                                                       "-value",
                                                       sep = ""))),
                                   title.position = "top",
                                   order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   order = 1,
                                   nrow = 2),
               ) + 
        # NOTE(review): these label keys are lower-case codes, while the
        # treatment values come from the gathered column names
        # (Untreated:QIAamp) -- confirm the mapping is actually applied
        scale_x_discrete(labels=c("control" = "Untreated",
                                  "lypma" = "lyPMA",
                                  "benzonase" = "Benzonase",
                                  "host_zero" = "Host-zero",
                                  "molysis" = "Molysis",
                                  "qiaamp" = "QIAamp")
                         ) +
        # Colour and legend label for the significance asterisk
        scale_color_manual(values = c("red"),
                           labels = c(expression(paste(italic("q"),
                                                       "-value < 0.1",
                                                       sep = "")
                                                 )
                                      )
                           )

# Balloon plot for BAL samples using the fit_data_bal_neg tables.
# Merging data into one data frame: the three tables are reshaped to long
# format (feature, treatment, <value>) and joined on feature + treatment with
# all = T, so combinations present in only one table are kept.
#   $rel         -> value (mean relative abundance, balloon size)
#   $data_italic -> qval  (balloon fill)
#   $data_sig    -> sig   ("*" label where q < 0.1)

merge(fit_data_bal_neg$rel %>%
              gather(treatment,
                     value,
                     Untreated:QIAamp,
                     factor_key=TRUE),
      fit_data_bal_neg$data_italic %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     qval,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
      by.x = c('feature', 'treatment'),
      by.y = c('feature', 'treatment'),
      all = T) %>%
        
        merge(fit_data_bal_neg$data_sig %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     sig,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
              by.x = c('feature', 'treatment'),
              by.y = c('feature', 'treatment'),
              all = T) %>%
# Balloon plot
        ggballoonplot(size = "value", fill = "qval", y = "feature", x= "treatment") +
        
        theme_classic(base_family = "serif") +
        # Colors for q-values
        gradient_fill(c("#006d2c", "#edf8fb")) +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              # Element markdown so the "*...*" taxon names render in italics
              axis.text.y = ggtext::element_markdown())  +
        # Adding significance asterisks; the constant col = "red" is placed
        # inside aes(), presumably so that a colour legend entry is created
        geom_text(aes(y = feature,
                      x = treatment,
                      label = sig,
                      col = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        # Legend order: size (1), fill (2), significance colour (3)
        guides(col = guide_legend(nrow = 1,
                                  override.aes = aes(label = "*", size = 10, color = "red"),
                                  title="Significance",
                                  title.position = "top", order = 3, ),
               fill = guide_colorbar(title = c(expression(paste(italic("q"),
                                                       "-value",
                                                       sep = ""))),
                                   title.position = "top",
                                   order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   order = 1,
                                   nrow = 2),
               ) + 
        # NOTE(review): these label keys are lower-case codes, while the
        # treatment values come from the gathered column names
        # (Untreated:QIAamp) -- confirm the mapping is actually applied
        scale_x_discrete(labels=c("control" = "Untreated",
                                  "lypma" = "lyPMA",
                                  "benzonase" = "Benzonase",
                                  "host_zero" = "Host-zero",
                                  "molysis" = "Molysis",
                                  "qiaamp" = "QIAamp")
                         ) +
        # Colour and legend label for the significance asterisk
        scale_color_manual(values = c("red"),
                           labels = c(expression(paste(italic("q"),
                                                       "-value < 0.1",
                                                       sep = "")
                                                 )
                                      )
                           )

Balloon plot - Nasal swabs

Mean relative abundances of the 20 taxa with the lowest q-values.

Some taxa changed after treatment, but nothing was unique

# Balloon plot for nasal samples (fit_data_ns).
# Merging data into one data frame: the three tables are reshaped to long
# format (feature, treatment, <value>) and joined on feature + treatment with
# all = T, so combinations present in only one table are kept.
#   $rel         -> value (mean relative abundance, balloon size)
#   $data_italic -> qval  (balloon fill)
#   $data_sig    -> sig   ("*" label where q < 0.1)
merge(fit_data_ns$rel %>%
              gather(treatment,
                     value,
                     Untreated:QIAamp,
                     factor_key=TRUE),
      fit_data_ns$data_italic %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     qval,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
      by.x = c('feature', 'treatment'),
      by.y = c('feature', 'treatment'),
      all = T) %>%
        
        merge(fit_data_ns$data_sig %>%
              rownames_to_column("feature") %>%
              gather(treatment,
                     sig,
                     lyPMA:QIAamp,
                     factor_key=TRUE),
              by.x = c('feature', 'treatment'),
              by.y = c('feature', 'treatment'),
              all = T) %>%
# Balloon plot
        ggballoonplot(size = "value", fill = "qval", y = "feature", x= "treatment") +
        
        theme_classic(base_family = "serif") +
        # Colors for q-values
        gradient_fill(c("#006d2c", "#edf8fb")) +
        xlab("Experimental group") +
        ylab("Species") +
        labs(tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust=0.5),
              # Element markdown so the "*...*" taxon names render in italics
              axis.text.y = ggtext::element_markdown())  +
        # Adding significance asterisks; the constant col = "red" is placed
        # inside aes(), presumably so that a colour legend entry is created
        geom_text(aes(y = feature,
                      x = treatment,
                      label = sig,
                      col = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        # Legend order: size (1), fill (2), significance colour (3)
        guides(col = guide_legend(nrow = 1,
                                  override.aes = aes(label = "*", size = 10, color = "red"),
                                  title="Significance",
                                  title.position = "top", order = 3, ),
               fill = guide_colorbar(title = c(expression(paste(italic("q"),
                                                       "-value",
                                                       sep = ""))),
                                   title.position = "top",
                                   order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   order = 1,
                                   nrow = 2),
               ) + 
        # NOTE(review): these label keys are lower-case codes, while the
        # treatment values come from the gathered column names
        # (Untreated:QIAamp) -- confirm the mapping is actually applied
        scale_x_discrete(labels=c("control" = "Untreated",
                                  "lypma" = "lyPMA",
                                  "benzonase" = "Benzonase",
                                  "host_zero" = "Host-zero",
                                  "molysis" = "Molysis",
                                  "qiaamp" = "QIAamp")
                         ) +
        # Colour and legend label for the significance asterisk
        scale_color_manual(values = c("red"),
                           labels = c(expression(paste(italic("q"),
                                                       "-value < 0.1",
                                                       sep = "")
                                                 )
                                      )
                           )

#Data with negative (fit_data_ns_neg): the abundance, q-value and significance
#tables are reshaped to long format and joined on feature x treatment
fit_data_ns_neg$rel %>%
        gather(key = treatment,
               value = value,
               Untreated:QIAamp,
               factor_key = TRUE) %>%
        #Full outer join with the table gathered as qval
        merge(fit_data_ns_neg$data_italic %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = qval,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Full outer join with the table gathered as sig (text drawn per cell)
        merge(fit_data_ns_neg$data_sig %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = sig,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Balloon plot: balloon size follows "value", fill follows "qval"
        ggballoonplot(x = "treatment",
                      y = "feature",
                      size = "value",
                      fill = "qval") +
        theme_classic(base_family = "serif") +
        #colors for qvalues
        gradient_fill(c("#006d2c", "#edf8fb")) +
        labs(x = "Experimental group",
             y = "Species",
             tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
              #Element markdown for taxa name italicizing
              axis.text.y = ggtext::element_markdown()) +
        #Adding significance marks; the constant colour is mapped inside aes()
        #so that a "Significance" legend entry is generated
        geom_text(aes(x = treatment,
                      y = feature,
                      label = sig,
                      colour = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        guides(colour = guide_legend(title = "Significance",
                                     title.position = "top",
                                     nrow = 1,
                                     order = 3,
                                     override.aes = aes(label = "*", size = 10, color = "red")),
               fill = guide_colorbar(title = expression(paste(italic("q"),
                                                              "-value",
                                                              sep = "")),
                                     title.position = "top",
                                     order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   nrow = 2,
                                   order = 1)) +
        #NOTE(review): the gathered treatment levels are column names such as
        #"Untreated"/"lyPMA"/"QIAamp"; these lower-case keys may not match any
        #of them - confirm whether this relabelling has any effect
        scale_x_discrete(labels = c("control" = "Untreated",
                                    "lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host-zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp")) +
        scale_color_manual(values = "red",
                           labels = expression(paste(italic("q"),
                                                     "-value < 0.1",
                                                     sep = "")))

Balloon plot - Sputum

Mean relative abundances of top 20 taxa had low q-values.

Some taxa changed after treatment, but nothing was unique

#Merging data into one dataframe (fit_data_spt): the abundance, q-value and
#significance tables are reshaped to long format and joined on
#feature x treatment
fit_data_spt$rel %>%
        gather(key = treatment,
               value = value,
               Untreated:QIAamp,
               factor_key = TRUE) %>%
        #Full outer join with the table gathered as qval
        merge(fit_data_spt$data_italic %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = qval,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Full outer join with the table gathered as sig (text drawn per cell)
        merge(fit_data_spt$data_sig %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = sig,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Balloon plot: balloon size follows "value", fill follows "qval"
        ggballoonplot(x = "treatment",
                      y = "feature",
                      size = "value",
                      fill = "qval") +
        theme_classic(base_family = "serif") +
        #colors for qvalues
        gradient_fill(c("#006d2c", "#edf8fb")) +
        labs(x = "Experimental group",
             y = "Species",
             tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
              #Element markdown for taxa name italicizing
              axis.text.y = ggtext::element_markdown()) +
        #Adding significance marks; the constant colour is mapped inside aes()
        #so that a "Significance" legend entry is generated
        geom_text(aes(x = treatment,
                      y = feature,
                      label = sig,
                      colour = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        guides(colour = guide_legend(title = "Significance",
                                     title.position = "top",
                                     nrow = 1,
                                     order = 3,
                                     override.aes = aes(label = "*", size = 10, color = "red")),
               fill = guide_colorbar(title = expression(paste(italic("q"),
                                                              "-value",
                                                              sep = "")),
                                     title.position = "top",
                                     order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   nrow = 2,
                                   order = 1)) +
        #NOTE(review): the gathered treatment levels are column names such as
        #"Untreated"/"lyPMA"/"QIAamp"; these lower-case keys may not match any
        #of them - confirm whether this relabelling has any effect
        scale_x_discrete(labels = c("control" = "Untreated",
                                    "lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host-zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp")) +
        scale_color_manual(values = "red",
                           labels = expression(paste(italic("q"),
                                                     "-value < 0.1",
                                                     sep = "")))

#Merging data into one dataframe (fit_data_spt_neg): the abundance, q-value
#and significance tables are reshaped to long format and joined on
#feature x treatment

fit_data_spt_neg$rel %>%
        gather(key = treatment,
               value = value,
               Untreated:QIAamp,
               factor_key = TRUE) %>%
        #Full outer join with the table gathered as qval
        merge(fit_data_spt_neg$data_italic %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = qval,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Full outer join with the table gathered as sig (text drawn per cell)
        merge(fit_data_spt_neg$data_sig %>%
                      rownames_to_column("feature") %>%
                      gather(key = treatment,
                             value = sig,
                             lyPMA:QIAamp,
                             factor_key = TRUE),
              by = c("feature", "treatment"),
              all = TRUE) %>%
        #Balloon plot: balloon size follows "value", fill follows "qval"
        ggballoonplot(x = "treatment",
                      y = "feature",
                      size = "value",
                      fill = "qval") +
        theme_classic(base_family = "serif") +
        #colors for qvalues
        gradient_fill(c("#006d2c", "#edf8fb")) +
        labs(x = "Experimental group",
             y = "Species",
             tag = "D") +
        theme(panel.grid.major = element_line(colour = "grey"),
              legend.position = "top",
              axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
              #Element markdown for taxa name italicizing
              axis.text.y = ggtext::element_markdown()) +
        #Adding significance marks; the constant colour is mapped inside aes()
        #so that a "Significance" legend entry is generated
        geom_text(aes(x = treatment,
                      y = feature,
                      label = sig,
                      colour = "red"),
                  hjust = -2,
                  vjust = 0.8,
                  size = 5) +
        guides(colour = guide_legend(title = "Significance",
                                     title.position = "top",
                                     nrow = 1,
                                     order = 3,
                                     override.aes = aes(label = "*", size = 10, color = "red")),
               fill = guide_colorbar(title = expression(paste(italic("q"),
                                                              "-value",
                                                              sep = "")),
                                     title.position = "top",
                                     order = 2),
               size = guide_legend(title = "Relative abundance",
                                   title.position = "top",
                                   nrow = 2,
                                   order = 1)) +
        #NOTE(review): the gathered treatment levels are column names such as
        #"Untreated"/"lyPMA"/"QIAamp"; these lower-case keys may not match any
        #of them - confirm whether this relabelling has any effect
        scale_x_discrete(labels = c("control" = "Untreated",
                                    "lypma" = "lyPMA",
                                    "benzonase" = "Benzonase",
                                    "host_zero" = "Host-zero",
                                    "molysis" = "Molysis",
                                    "qiaamp" = "QIAamp")) +
        scale_color_manual(values = "red",
                           labels = expression(paste(italic("q"),
                                                     "-value < 0.1",
                                                     sep = "")))

Results

Some taxa changed significantly after treatment. Among the top 20, no taxon was observed in only one treatment group. As their emergence was consistent across all treatment groups, they were considered endogenous.

MaAslin with interaction

feature ~ log10(Final reads) + treatment + sample type + treatment * sample type + (1|subject)

Some taxa were treatment specific, after adjusting for the interaction of sample type * treatment

#Generating interaction term
#sample_data(phyloseq_rel_nz)$sampletype_treatment <- paste(sample_data(phyloseq_rel_nz)$sample_type, #sample_data(phyloseq_rel_nz)$treatment, sep = "*")

#capture.output(maaslin_interaction = Maaslin2(input_data = otu_table(phyloseq_rel_nz) %>% t %>% data.frame(), 
#                 input_metadata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = F), 
#                 output = "data", 
#                 fixed_effects = c("sample_type", "log10.Final_reads", "treatment", "sampletype_treatment"), 
#                 transform = "LOG", #default
#                 normalization = "TSS", 
#                 random_effects = c("subject_id"), 
#                 reference = c("sample_type,BAL", "treatment,Untreated", "sampletype_treatment,BAL*Untreated"), 
#                 plot_heatmap = F,
#                 plot_scatter = F))

#Read the saved MaAsLin results table (presumably exported from the
#commented-out Maaslin2 run above - confirm)
maaslin_interaction <- read.csv("data/maaslin_interaction.csv")
#interaction term - ggplot
#Volcano-style plot: MaAsLin coefficient (x) against -log10(q-value) (y),
#coloured by the fixed effect each row belongs to
ggplot(maaslin_interaction, aes(y = -log10(qval), x = coef, col = metadata)) +
         theme_classic(base_family = "serif") +
         #labs(tag = "A") +
         ggtitle("MaAslin with interaction term") +
         geom_point(size = 2) +
         xlab("MaAslin coefficient") +
         ylab("-log<sub>10</sub>(*q*-value)") +
         #Reference lines: y = 1 corresponds to q = 0.1, x = 0 to no change
         geom_hline(yintercept = 1, col = "gray") +
         geom_vline(xintercept = 0, col = "gray") +
         #The annotation gets its own one-row data frame and does not inherit
         #the plot aesthetics; otherwise the same label is drawn once for
         #every row of maaslin_interaction, stacked on top of itself
         geom_richtext(data = data.frame(x = 4,
                                         y = 8,
                                         label = "*q*-value = 0.1, fold-change = 0"),
                       aes(x = x, y = y, label = label),
                       inherit.aes = FALSE,
                       vjust = -1,
                       fontface = 1,
                       col = "grey",
                       size = 3,
                       family = "serif") +
         #element_markdown is needed to render the <sub> and *q* markup
         theme(legend.position = "top", axis.title.y = ggtext::element_markdown()) +
         scale_color_manual(values = c("#e41a1c",  "#377eb8", "#4daf4a", "#984ea3")) +
         guides(col = guide_legend(title = "Fixed effects", title.position = "top", nrow = 1))

#Counting the significant (q < 0.1) rows of the interaction model
#per metadata term
cat("Number of differentially abundant bugs by each metadata")
## Number of differentially abundant bugs by each metadata
maaslin_interaction %>%
        dplyr::filter(qval < 0.1) %>%
        dplyr::pull(metadata) %>%
        table()
## .
##    log10.Final_reads          sample_type sampletype_treatment 
##                   38                  146                  560 
##            treatment 
##                  135

MaAsLin interaction analysis

Hypothesis: if a sample is contaminated by some treatment, a change of taxon is likely to be associated with one treatment method

No taxa increased only because of one treatment method

 #Per-feature count of significant (q < 0.1) rows whose metadata term is
 #"treatment", printed as an HTML table sorted by decreasing count
 cat("Some taxa were increased by each treatmment.\n But they are not contaminants, \nif they are present in most of the treatments")
## Some taxa were increased by each treatmment.
##  But they are not contaminants, 
## if they are present in most of the treatments
 maaslin_interaction %>%
        dplyr::filter(qval < 0.1, metadata == "treatment") %>%
        dplyr::pull(feature) %>%
        table() %>%
        data.frame() %>%
        arrange(desc(Freq)) %>%
        #table() names its first column "."; give it a readable header
        rename(Feature = ".") %>%
        kbl(format = "html",
            caption = "Table of taxa differentially abundant by treatment") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Table of taxa differentially abundant by treatment
Feature Freq
Cryptococcus_gattii_VGI 5
Cryptococcus_gattii_VGII 5
Cryptococcus_neoformans 5
Hydrogenibacillus_schlegelii 5
Kouleothrix_aurantiaca 5
Limnochorda_pilosa 5
Listeria_floridensis 5
Saccharomyces_cerevisiae 5
Saccharomyces_cerevisiae_x_Saccharomyces_kudriavzevii 5
Thermoleophilum_album 5
Acholeplasma_oculi 4
Alkalilimnicola_ehrlichii 4
Bacillus_ginsengihumi 4
Bacillus_intestinalis 4
Brochothrix_campestris 4
Cryptococcus_gattii_VGIII 4
Cupriavidus_sp 4
Cutibacterium_acnes 4
Escherichia_coli 4
Listeria_innocua 4
Listeria_monocytogenes 4
Paludisphaera_borealis 4
Pseudomonas_aeruginosa_group 4
Pseudomonas_formosensis 4
Saccharomyces_kudriavzevii 4
Salmonella_enterica 4
Staphylococcus_schweitzeri 4
Sutterella_parvirubra 4
Thiohalorhabdus_denitrificans 4
Staphylococcus_argenteus 3
Enterococcus_faecalis 2
Brevundimonas_diminuta 1
Dolosigranulum_pigrum 1
Granulicatella_elegans 1
Microbacterium_laevaniformans 1
 #Features with exactly one significant (q < 0.1) "treatment" row are looked
 #up first; every significant row of those features (any metadata term) is
 #then listed as an HTML table
 cat("Most of taxa were found on most of treatments.")
## Most of taxa were found on most of treatments.
 cat("Some taxa were treatment specific, only to one group")
## Some taxa were treatment specific, only to one group
maaslin_interaction %>%
        dplyr::filter(qval < 0.1,
                      feature %in% (maaslin_interaction %>%
                                            dplyr::filter(qval < 0.1,
                                                          metadata == "treatment") %>%
                                            dplyr::pull(feature) %>%
                                            table() %>%
                                            data.frame() %>%
                                            dplyr::filter(Freq == 1) %>%
                                            #first column holds the feature names
                                            dplyr::pull(1) %>%
                                            as.character())) %>%
        dplyr::select(feature, metadata, value, coef, qval) %>%
        remove_rownames() %>%
        kbl(format = "html",
            caption = "Table of taxa specific to one treatment group") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Table of taxa specific to one treatment group
feature metadata value coef qval
Granulicatella_elegans sampletype_treatment Sputum*Control -8.4411029 0.0000000
Microbacterium_laevaniformans sample_type Neg. 9.1206764 0.0000000
Granulicatella_elegans sample_type Sputum 8.2066053 0.0000000
Brevundimonas_diminuta treatment lyPMA 7.3874257 0.0000053
Granulicatella_elegans sampletype_treatment Sputum*QIAamp 4.4759863 0.0000075
Granulicatella_elegans sampletype_treatment Sputum*Host zero 3.8472447 0.0001604
Granulicatella_elegans sampletype_treatment Nasal*QIAamp 4.1672768 0.0001821
Granulicatella_elegans sampletype_treatment Nasal*Host zero 3.3489985 0.0011514
Microbacterium_laevaniformans sampletype_treatment Neg.*Molysis 6.8528828 0.0014263
Brevundimonas_diminuta sampletype_treatment Neg.*Molysis 7.9717485 0.0014673
Granulicatella_elegans sampletype_treatment Nasal*lyPMA 3.4918665 0.0015153
Microbacterium_laevaniformans sampletype_treatment Neg.*Host zero 6.6495802 0.0019716
Granulicatella_elegans treatment Benzonase 3.1206692 0.0028540
Granulicatella_elegans sampletype_treatment Neg.*Benzonase -3.1610441 0.0029075
Granulicatella_elegans sampletype_treatment BAL*Benzonase -3.6395276 0.0044223
Brevundimonas_diminuta sampletype_treatment Neg.*Host zero 7.1426956 0.0049037
Microbacterium_laevaniformans sampletype_treatment Neg.*QIAamp 6.0831853 0.0057000
Brevundimonas_diminuta sampletype_treatment BAL*lyPMA -7.1358804 0.0059883
Brevundimonas_diminuta sample_type Neg. 4.8402240 0.0079955
Brevundimonas_diminuta sampletype_treatment Nasal*lyPMA -6.4798307 0.0101716
Granulicatella_elegans sample_type Nasal -3.8626177 0.0105339
Granulicatella_elegans sampletype_treatment Nasal*Control 3.2170548 0.0106974
Microbacterium_laevaniformans sampletype_treatment Neg.*Benzonase 5.6035401 0.0120596
Dolosigranulum_pigrum sample_type Nasal 11.1588968 0.0129222
Brevundimonas_diminuta sampletype_treatment Sputum*lyPMA -6.2658174 0.0142266
Granulicatella_elegans log10.Final_reads log10.Final_reads 0.5136443 0.0373443
Microbacterium_laevaniformans sampletype_treatment Nasal*lyPMA -4.6907843 0.0378643
Microbacterium_laevaniformans sampletype_treatment Sputum*lyPMA -4.3907626 0.0597485
Dolosigranulum_pigrum treatment lyPMA -2.1094994 0.0645820
Granulicatella_elegans sampletype_treatment BAL*Molysis -2.4603992 0.0830127
Microbacterium_laevaniformans treatment lyPMA 2.7417300 0.0926297

No taxa were increased due to one treatment.

A5 Results:

5.1. Both non-stratified and stratified analysis showed that there were no potential contaminants at species level.

5.2. Molysis may have introduced 1 potential contaminant (Streptococcaceae), at the family level

5.3. After adding control data, MaAslin needs to be reanalyzed. Adding controls (mock communities) for each treatment group will give more statistically valid results for y ~ log(final reads) + sample_type + treatment (re = subject_id).

A6. Decontam - stratified by treatment

input of DNA concentration: 16S qPCR data

https://github.com/benjjneb/decontam/issues/33

Ben Callahan: But in the more limited testing on qPCR data the method still seems to work, and other publications report strong patterns of inverse frequency of contaminants using qPCR data - which is the pattern the frequency method relies on.

Both stratified and nonstratified

Strategy:

2.4.1. run decontam for all samples (common contaminants, by extraction)

2.4.2. stratify decontam analysis per each treatment method (contaminants by depletion methods)

Results

decontam - all sample

Listeria floridensis could be a potential contaminant

# Decontam package --------------------------------------------------------

# common contaminants across all the treatment methods
#Decontam - were there any contaminants?#
#Samples whose sample_type contains "Neg" are flagged in is.neg
sample_data(phyloseq$phyloseq_rel)$is.neg <- grepl(pattern = "Neg",
                                                   x = sample_data(phyloseq$phyloseq_rel)$sample_type)
#Only samples with S.obs different from 0 are kept
phyloseq_rel_nz <- subset_samples(phyloseq$phyloseq_rel, S.obs != 0)
#With all samples
#frequency uses the DNA_bac_well column as concentration, prevalence uses the
#is.neg flag (threshold 0.5), combined uses both
dec_f_all <- isContaminant(phyloseq_rel_nz,
                           conc = "DNA_bac_well",
                           method = "frequency")
dec_p_all <- isContaminant(phyloseq_rel_nz,
                           neg = "is.neg",
                           method = "prevalence",
                           threshold = 0.5)
dec_c_all <- isContaminant(phyloseq_rel_nz,
                           conc = "DNA_bac_well",
                           neg = "is.neg",
                           method = "combined")

#Printing only the rows flagged as contaminant
cat("decontam frequency - all sample")
## decontam frequency - all sample
subset(dec_f_all, dec_f_all$contaminant)
cat("decontam prevalence - all sample")
## decontam prevalence - all sample
subset(dec_p_all, dec_p_all$contaminant)
cat("decontam combined - all sample")
## decontam combined - all sample
subset(dec_c_all, dec_c_all$contaminant)

decontam - stratified by sample_type

Stratified analysis showed no contaminants in NS and BAL

Sputum may have Corynebacterium pseudodiphtheriticum and Candida albicans as contaminants.

#Stratified by sample type

#Each decontam method (prevalence, frequency, combined) is run once per sample
#type, on that sample type plus the negative controls. The output order is
#the same as before: all prevalence runs, then frequency, then combined.
#
#Negative controls are selected through the is.neg flag (built with
#grepl("Neg", sample_type)) instead of the literal level "Neg": other parts
#of this analysis spell the level "Neg.", and an exact match on "Neg" would
#then silently drop every negative control from the subset.
#prune_samples() takes a plain logical vector, so no non-standard evaluation
#is involved in the subsetting.
decontam_meta <- sample_data(phyloseq_rel_nz)

for (decontam_method in c("prevalence", "frequency", "combined")) {
        for (decontam_stratum in c("BAL", "Nasal", "Sputum")) {
                cat(sprintf("decontam %s - %s\n", decontam_method, decontam_stratum))

                keep_samples <- decontam_meta$sample_type %in% decontam_stratum |
                        decontam_meta$is.neg
                phyloseq_stratum <- prune_samples(keep_samples, phyloseq_rel_nz)

                #Same arguments per method as the non-stratified analysis
                decontam_result <- switch(
                        decontam_method,
                        prevalence = isContaminant(phyloseq_stratum,
                                                   method = "prevalence",
                                                   neg = "is.neg",
                                                   threshold = 0.5),
                        frequency = isContaminant(phyloseq_stratum,
                                                  method = "frequency",
                                                  conc = "DNA_bac_well"),
                        combined = isContaminant(phyloseq_stratum,
                                                 method = "combined",
                                                 neg = "is.neg",
                                                 conc = "DNA_bac_well")
                )

                #Explicit print(): values are not auto-printed inside a loop
                print(subset(decontam_result, decontam_result$contaminant))
        }
}

Stratified analysis showed no contaminants in NS and BAL

Sputum may have Corynebacterium pseudodiphtheriticum and Candida albicans as contaminants.

#Stratified by treatment

#Each treatment is analysed with all three decontam methods.
#
#The previous calls were subset_samples(x, treatment = "lypma"): with "="
#instead of "==" the value is passed as a named argument, no filter is
#applied, and every run used ALL samples. The headers and methods were also
#copy-pasted (e.g. "prevalence - lyPMA" three times), so most treatment x
#method combinations were never run.
#
#NOTE(review): the keys below are the treatment codes used by the original
#calls; other output in this analysis shows labels such as "lyPMA" and
#"Host zero". Confirm they match the levels of sample_data()$treatment -
#a stratum without any sample is skipped with a warning.
decontam_meta <- sample_data(phyloseq_rel_nz)
decontam_treatments <- c("lypma" = "lyPMA",
                         "benzonase" = "Benzonase",
                         "molysis" = "Molysis",
                         "host_zero" = "Host zero",
                         "qiaamp" = "QIAamp")

for (decontam_treatment in names(decontam_treatments)) {
        in_treatment <- decontam_meta$treatment %in% decontam_treatment

        if (!any(in_treatment)) {
                warning("No samples with treatment == '", decontam_treatment,
                        "'; decontam skipped for this stratum", call. = FALSE)
                next
        }

        #prune_samples() takes a plain logical vector (one value per sample)
        phyloseq_stratum <- prune_samples(in_treatment, phyloseq_rel_nz)
        #prevalence and combined need negative controls inside the stratum
        has_negative <- any(decontam_meta$is.neg[in_treatment])

        for (decontam_method in c("prevalence", "frequency", "combined")) {
                cat(sprintf("decontam %s - %s\n",
                            decontam_method,
                            decontam_treatments[[decontam_treatment]]))

                if (decontam_method != "frequency" && !has_negative) {
                        message("No negative controls in this stratum; ",
                                decontam_method, " method skipped")
                        next
                }

                #Same arguments per method as the non-stratified analysis
                decontam_result <- switch(
                        decontam_method,
                        prevalence = isContaminant(phyloseq_stratum,
                                                   method = "prevalence",
                                                   neg = "is.neg",
                                                   threshold = 0.5),
                        frequency = isContaminant(phyloseq_stratum,
                                                  method = "frequency",
                                                  conc = "DNA_bac_well"),
                        combined = isContaminant(phyloseq_stratum,
                                                 method = "combined",
                                                 neg = "is.neg",
                                                 conc = "DNA_bac_well")
                )

                #Explicit print(): values are not auto-printed inside a loop
                print(subset(decontam_result, decontam_result$contaminant))
        }
}

A6 Results:

6.1. Listeria floridensis could be a potential contaminant

6.2. Else, BAL and NS are free from contaminants, and sputum may have Corynebacterium pseudodiphtheriticum and Candida albicans as contaminants.

Further analysis is required after adding data of controls.

A7. LM of function alpha diversity

#Sample metadata of the pathway (RPKM) phyloseq object as a plain data frame,
#without the rows whose simpson value is NaN
sample_data <- data.frame(sample_data(phyloseq$phyloseq_path_rpkm),
                          check.names = FALSE)
sample_data <- subset(sample_data, !is.nan(sample_data$simpson))
#Pathway data restricted to samples with S.obs different from 0 and to the
#five listed sample types (note: the object name phyloseq_rel_nz is reused)
phyloseq_rel_nz <- subset_samples(
        phyloseq$phyloseq_path_rpkm,
        S.obs != 0 & sample_type %in% c("BAL", "Nasal", "Sputum", "Neg.", "Mock")
)
#Derived covariates: log10 of Final_reads and a combined
#"sample_type:treatment" label
sample_data(phyloseq_rel_nz)$log10.Final_reads <-
        log10(sample_data(phyloseq_rel_nz)$Final_reads)
sample_data(phyloseq_rel_nz)$sampletype_treatment <-
        paste(sample_data(phyloseq_rel_nz)$sample_type,
              sample_data(phyloseq_rel_nz)$treatment,
              sep = ":")

Figure - Alpha diversity

Alpha diversity of the functional analysis result showed a similar pattern to the taxa result.

Similar approach was employed.

#Boxplot of one alpha-diversity index by treatment, one facet per sample type.
#  data        data frame with the index column, treatment and sample_type
#  index       name (string) of the column to plot on the y axis
#  index_label y-axis title
#  tag         panel tag ("A", "B", ...)
#Returns a ggplot object.
plot_function_alpha <- function(data, index, index_label, tag) {
        ggplot(data, aes(y = .data[[index]])) +
                geom_boxplot(aes(fill = treatment), lwd = 0.2) +
                #colors: 6-class "Paired" palette from https://colorbrewer2.org
                #labels are positional, i.e. they follow the level order of treatment
                scale_fill_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"),
                                  name = "Treatment",
                                  labels = c("Untreated", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) +
                ylab(index_label) +
                theme_classic(base_size = 12, base_family = "serif") +
                labs(tag = tag) +
                #x axis carries no information (one box per fill colour)
                theme(plot.tag = element_text(size = 15),
                      axis.text.x = element_blank(),
                      axis.ticks.x = element_blank()) +
                facet_wrap(~sample_type) +
                guides(fill = guide_legend(nrow = 1))
}

#Plot data: metadata of the pathway phyloseq object as a plain data frame,
#filtered on its OWN sample_type column. The filter was previously built from
#the separate, NaN-filtered `sample_data` data frame, whose rows do not have
#to line up with this table.
alpha_plot_data <- sample_data(phyloseq$phyloseq_path_rpkm) %>%
        data.frame(check.names = FALSE) %>%
        subset(sample_type %in% c("Sputum", "Nasal", "BAL"))

f4a <- plot_function_alpha(alpha_plot_data, "S.obs", "Species richness", "A")
f4b <- plot_function_alpha(alpha_plot_data, "data_shannon", "Shannon", "B")
f4c <- plot_function_alpha(alpha_plot_data, "data_invsimpson", "Inverse simpson", "C")
f4d <- plot_function_alpha(alpha_plot_data, "dbp", "Berger-Parker index", "D")

ggarrange(f4a, f4b, f4c, f4d, common.legend = TRUE, align = "hv") # alpha diversity plots

Function richness

Alpha diversity could change due to treatment.

Both stratified and nonstratified analyses were conducted.

All samples:

S.obs ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

S.obs ~ sample_type + log10 (Final_reads) + (1|original_sample)

Function richness (all samples & interaction term) - ANOVA

Interaction term showed high p values. However, it could be due to even effect sample type * treatment. Interaction term will be tested.

#Metadata of the pathway phyloseq object as a data frame, without the rows
#whose simpson value is NaN
sample_data <- data.frame(sample_data(phyloseq$phyloseq_path_rpkm),
                          check.names = FALSE)
sample_data <- subset(sample_data, !is.nan(sample_data$simpson))
#Mixed model of S.obs: sample_type x treatment interaction, log10 read depth
#and a random intercept per subject_id
lmer_sob <- lmer(S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
                 data = sample_data)
#ANOVA table of the model as HTML; terms with p < 0.05 get "*" in an unnamed
#column and ":" in the term names is shown as " * "
anova(lmer_sob) %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", x),
               ` ` = case_when(abs(`Pr(>F)`) < 0.05 ~ "*",
                               .default = " ")) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 37549.41 9387.353 4 14.04851 8.565282 0.0010209
treatment 32851.08 6570.217 5 103.61403 5.994849 0.0000653
log10(Final_reads) 90968.23 90968.229 1 113.90994 83.001948 0.0000000
sample_type * treatment 91552.36 4577.618 20 100.80643 4.176746 0.0000008

Function richness (all samples & interaction term)

The effect of some treatments was neutralized by the interaction term. Therefore, the association was sample_type specific.

Stratified analysis will be conducted.

# Coefficient table of the all-sample richness model (with interaction term).
# Term names are shortened by removing the "treatment"/"sample_type" prefixes,
# ":" is displayed as " * ", and rows with p < 0.05 are flagged with "*".
lmer(
        S.obs ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
        data = sample_data
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -383.158282 59.984929 74.83486 -6.3875758 0.0000000
Mock 203.918522 52.531891 20.31266 3.8818044 0.0009058
BAL -1.095471 43.958000 27.47243 -0.0249209 0.9802982
Nasal 43.349247 39.476273 21.79227 1.0981089 0.2841421
Sputum 124.694518 41.649510 22.75598 2.9939012 0.0065308
lyPMA 51.382920 31.479656 100.44998 1.6322580 0.1057562
Benzonase -35.158089 24.193720 98.95731 -1.4531907 0.1493347
Host zero -43.729450 24.294952 99.12001 -1.7999398 0.0749123
Molysis -46.214379 24.223462 99.00536 -1.9078355 0.0593101
QIAamp -18.489418 25.344982 99.02304 -0.7295100 0.4674118
log10(Final_reads) 74.222366 8.146868 113.90994 9.1105405 0.0000000
Mock * lyPMA -34.894063 39.269456 101.75242 -0.8885802 0.3763245
BAL * lyPMA -8.566753 39.776838 99.25496 -0.2153704 0.8299205
Nasal * lyPMA 6.268734 38.148085 103.97327 0.1643263 0.8697933
Sputum * lyPMA -10.468558 37.029354 99.20803 -0.2827097 0.7779884
Mock * Benzonase -45.396371 31.423393 98.95087 -1.4446680 0.1517104
BAL * Benzonase 105.298371 35.446466 102.00842 2.9706310 0.0037060
Nasal * Benzonase 38.051696 30.881512 103.41089 1.2321837 0.2206748
Sputum * Benzonase 57.175589 32.538793 99.59139 1.7571515 0.0819651
Mock * Host zero -53.522125 31.480358 99.02168 -1.7001752 0.0922369
BAL * Host zero 143.331632 35.369394 101.89139 4.0524198 0.0000992
Nasal * Host zero 36.754407 31.232445 103.97139 1.1768021 0.2419616
Sputum * Host zero 57.130886 33.900582 101.05164 1.6852480 0.0950266
Mock * Molysis 33.565937 31.450375 98.98447 1.0672667 0.2884475
BAL * Molysis 163.462129 35.740427 102.42825 4.5735920 0.0000135
Nasal * Molysis 75.501680 30.833226 102.90432 2.4487117 0.0160273
Sputum * Molysis 41.704107 35.204240 102.26355 1.1846331 0.2389084
Mock * QIAamp -27.329172 32.280320 98.94583 -0.8466202 0.3992498
BAL * QIAamp 73.424629 36.463347 102.24051 2.0136558 0.0466731
Nasal * QIAamp 15.581713 32.454958 103.38154 0.4801027 0.6321686
Sputum * QIAamp 29.006234 34.256713 100.51284 0.8467314 0.3991564

All terms were significant.

Function richness - stratified (NS)

Some treatment enabled discovering more functions in Nasals

# Richness model restricted to nasal samples: treatment effect adjusted for
# log10 read depth, random intercept per subject. Rows with p < 0.05 get "*".
lmer(
        S.obs ~ treatment + log10(Final_reads) + (1 | subject_id),
        data = subset(sample_data, sample_type == "Nasal")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -37.765323 54.755769 23.34813 -0.6897049 0.4971783
lyPMA 35.228578 11.218262 20.15999 3.1402885 0.0051161
Benzonase 7.066046 10.406810 19.90394 0.6789829 0.5049681
Host zero 30.210307 12.622416 21.05053 2.3933854 0.0260893
Molysis 36.387452 10.412660 19.69726 3.4945397 0.0023246
QIAamp 48.253391 13.356859 20.43457 3.6126300 0.0016903
log10(Final_reads) 27.875498 8.269368 22.47665 3.3709343 0.0026979

Function richness (BAL)

Higher Final reads enables more discovery of functions.

# Richness model restricted to BAL samples: treatment effect adjusted for
# log10 read depth. Rows with p < 0.05 get "*".
# NOTE(review): the random intercept here is `original_sample`, while the
# nasal and all-sample models use `subject_id` -- confirm which is intended.
lmer(
        S.obs ~ treatment + log10(Final_reads) + (1 | original_sample),
        data = subset(sample_data, sample_type == "BAL")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -619.07837 97.40440 18.90640 -6.3557535 0.0000044
lyPMA 12.81140 33.30997 18.47738 0.3846115 0.7049162
Benzonase 16.06655 35.09574 19.99559 0.4577921 0.6520375
Host zero 39.14552 36.34582 19.96658 1.0770296 0.2943086
Molysis 52.86775 37.17369 19.89387 1.4221819 0.1704638
QIAamp -10.00821 37.29607 19.88056 -0.2683449 0.7911986
log10(Final_reads) 120.04861 17.60959 17.16210 6.8172310 0.0000028

Function richness (sputum)

Sputum showed no changes. This may be due to an enrichment of richness in control groups.

# Richness model restricted to sputum samples: treatment effect adjusted for
# log10 read depth. Rows with p < 0.05 get "*".
# NOTE(review): the random intercept here is `original_sample`, while the
# nasal and all-sample models use `subject_id` -- confirm which is intended.
lmer(
        S.obs ~ treatment + log10(Final_reads) + (1 | original_sample),
        data = subset(sample_data, sample_type == "Sputum")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) -138.66543 192.34371 21.48406 -0.7209252 0.4787277
lyPMA 52.01863 27.85307 19.93178 1.8676082 0.0765926
Benzonase 39.40910 35.11082 20.48877 1.1224207 0.2746653
Host zero 47.80613 58.99496 21.05693 0.8103425 0.4268072
Molysis 36.40748 68.78723 21.13504 0.5292767 0.6021291
QIAamp 39.62629 51.21113 20.95872 0.7737826 0.4477001
log10(Final_reads) 53.66176 32.83658 21.34597 1.6342066 0.1168788

Simpson function

Inverse Simpson of all samples:

Inverse Simpson ~ sample_type * treatment + log10 (Final_reads) + (1|original_sample)

Stratified:

Inverse Simpson ~ treatment + log10 (Final_reads) + (1|original_sample)

Inv Simp - ANOVA

p - value = 0.096 for the interaction term. Interaction term will be tested.

# Mixed model of `data_invsimpson` with a sample type x treatment interaction,
# adjusted for log10 read depth, with a random intercept per subject.
lmer_invsimpson <- lmer(
        data_invsimpson ~ sample_type * treatment + log10(Final_reads) + (1 | subject_id),
        data = sample_data
)

# ANOVA table of the model; rows with p < 0.05 are flagged with "*" and
# interaction terms are shown as "a * b" instead of "a:b".
anova(lmer_invsimpson) %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", x)) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
sample_type 0.4555813 0.1138953 4 7.808402 3.1934129 0.0778088
treatment 0.0924146 0.0184829 5 106.883870 0.5182264 0.7619900
log10(Final_reads) 0.4769971 0.4769971 1 107.468738 13.3741106 0.0003968
sample_type * treatment 1.0819608 0.0540980 20 102.083083 1.5168082 0.0915111

Inv. Simpson (all samples & interaction term)

Sample type specific effect was observed. Stratified analysis required.

#Simpson

# Coefficient table of the all-sample `lmer_invsimpson` model. Term names are
# shortened by removing the "treatment"/"sample_type" prefixes, ":" is shown
# as " * ", and rows with p < 0.05 are flagged with "*".
summary(lmer_invsimpson) %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.4129598 0.2818527 83.63165 8.5610674 0.0000000
Mock 0.0275772 0.1903493 25.42229 0.1448768 0.8859506
BAL 0.0714714 0.1756825 47.45648 0.4068212 0.6859684
Nasal 0.3474791 0.1480822 33.68398 2.3465278 0.0249738
Sputum 0.4476724 0.1587808 37.79368 2.8194366 0.0076172
lyPMA -0.0167207 0.1784706 102.51973 -0.0936890 0.9255390
Benzonase -0.0476879 0.1380003 99.44655 -0.3455637 0.7304006
Host zero -0.0057907 0.1384873 99.78258 -0.0418138 0.9667307
Molysis -0.1336443 0.1381433 99.54580 -0.9674322 0.3356726
QIAamp 0.1066600 0.1445290 99.58232 0.7379837 0.4622599
log10(Final_reads) -0.1560495 0.0426707 107.46874 -3.6570631 0.0003968
Mock * lyPMA 0.1819456 0.2214136 105.15685 0.8217457 0.4130809
BAL * lyPMA 0.3534296 0.2265502 100.38312 1.5600496 0.1218954
Nasal * lyPMA -0.0800303 0.2135225 107.59502 -0.3748095 0.7085399
Sputum * lyPMA -0.0256880 0.2110019 99.96434 -0.1217432 0.9033468
Mock * Benzonase 0.1552707 0.1792427 99.43323 0.8662595 0.3884333
BAL * Benzonase 0.0900448 0.1994768 106.43493 0.4514049 0.6526162
Nasal * Benzonase 0.0536948 0.1736092 104.99243 0.3092855 0.7577173
Sputum * Benzonase -0.1076592 0.1851264 100.75533 -0.5815444 0.5621724
Mock * Host zero 0.1347771 0.1795167 99.57950 0.7507775 0.4545570
BAL * Host zero 0.2314543 0.1991447 106.20401 1.1622418 0.2477424
Nasal * Host zero -0.0964932 0.1751823 105.94336 -0.5508161 0.5829194
Sputum * Host zero -0.2287391 0.1917129 103.74581 -1.1931334 0.2355386
Mock * Molysis 0.2483938 0.1793725 99.50263 1.3847930 0.1692130
BAL * Molysis 0.4819250 0.2007603 107.25309 2.4004988 0.0180949
Nasal * Molysis -0.0779791 0.1735718 104.68812 -0.4492611 0.6541721
Sputum * Molysis -0.0596846 0.1980538 106.17034 -0.3013557 0.7637327
Mock * QIAamp 0.0428010 0.1841344 99.42283 0.2324443 0.8166708
BAL * QIAamp -0.0561860 0.2049968 106.86142 -0.2740823 0.7845505
Nasal * QIAamp -0.2362964 0.1823044 105.71980 -1.2961638 0.1977415
Sputum * QIAamp -0.3755703 0.1941641 102.64831 -1.9342930 0.0558304

Inverse Simpson - stratified (NS)

Inverse Simpson ~ treatment + log10 (Final_reads) + (1|original_sample)

Nasal showed changes after Molysis treatment

# `data_invsimpson` model restricted to nasal samples: treatment effect
# adjusted for log10 read depth, random intercept per subject.
# Rows with p < 0.05 get "*".
lmer(
        data_invsimpson ~ treatment + log10(Final_reads) + (1 | subject_id),
        data = subset(sample_data, sample_type == "Nasal")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.4811470 0.3874040 27.07580 6.4045475 0.0000007
lyPMA -0.0693508 0.0880237 23.97217 -0.7878651 0.4384971
Benzonase -0.0047597 0.0825888 22.71718 -0.0576314 0.9545462
Host zero -0.1435767 0.0962040 26.11397 -1.4924199 0.1475728
Molysis -0.2248548 0.0830164 22.52427 -2.7085608 0.0126705
QIAamp -0.1700352 0.1031617 25.68792 -1.6482392 0.1114825
log10(Final_reads) -0.1131938 0.0589236 27.09916 -1.9210251 0.0653065

Inverse Simpson - stratified (BAL)

No changes found at BAL

# `data_invsimpson` model restricted to BAL samples: treatment effect adjusted
# for log10 read depth. Rows with p < 0.05 get "*".
# NOTE(review): the random intercept here is `original_sample`, while the
# nasal and all-sample models use `subject_id` -- confirm which is intended.
lmer(
        data_invsimpson ~ treatment + log10(Final_reads) + (1 | original_sample),
        data = subset(sample_data, sample_type == "BAL")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 2.7219275 0.7353785 18.44625 3.7013966 0.0015784
lyPMA 0.3561431 0.2540316 18.47266 1.4019640 0.1775062
Benzonase 0.0775164 0.2662674 19.99882 0.2911223 0.7739557
Host zero 0.2668472 0.2754566 19.90468 0.9687450 0.3442973
Molysis 0.3931670 0.2815568 19.78278 1.3964040 0.1780653
QIAamp 0.0958921 0.2824594 19.76198 0.3394897 0.7378222
log10(Final_reads) -0.1992993 0.1325153 16.23589 -1.5039723 0.1517915

Inverse Simpson - stratified (spt)

Changes associated with deeper sequencing with sputum

# `data_invsimpson` model restricted to sputum samples: treatment effect
# adjusted for log10 read depth. Rows with p < 0.05 get "*".
# NOTE(review): the random intercept here is `original_sample`, while the
# nasal and all-sample models use `subject_id` -- confirm which is intended.
lmer(
        data_invsimpson ~ treatment + log10(Final_reads) + (1 | original_sample),
        data = subset(sample_data, sample_type == "Sputum")
) %>%
        summary() %>%
        .[["coefficients"]] %>%
        data.frame(check.names = FALSE) %>%
        mutate(` ` = if_else(abs(`Pr(>|t|)`) < 0.05, "*", " ", missing = " ")) %>%
        rownames_to_column(var = "x") %>%
        mutate(x = gsub(":", " * ", gsub("treatment|sample_type", "", x))) %>%
        column_to_rownames(var = "x") %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 4.2492828 0.4271773 20.76851 9.9473516 0.0000000
lyPMA 0.0863071 0.0612866 19.53742 1.4082531 0.1747707
Benzonase 0.0462489 0.0774828 19.90864 0.5968921 0.5573071
Host zero 0.1642745 0.1306066 20.29757 1.2577811 0.2227494
Molysis 0.2809720 0.1523555 20.35230 1.8441861 0.0797637
QIAamp 0.0685141 0.1133096 20.22927 0.6046627 0.5521257
log10(Final_reads) -0.3943791 0.0728223 20.50219 -5.4156359 0.0000245

A8. Function beta diversity

Permanova (Taxa dist ~ log10(final reads) + sample_type + treatment + sample_type * treatment + subject_id) –> both stratified and nonstratified

Beta diversity figure

PCoA based on Bray-Curtis dissimilarities

# PERMANOVA (vegan::adonis2) on Bray-Curtis dissimilarities of
# `phyloseq_rel_nz`, with permutations restricted within subject
# (`strata = subject_id`) and 10000 permutations per test.
#
# `by = "terms"` is passed explicitly: the result tables rendered further down
# look rows up by term name (sample_type, treatment, lypma, ...), so a
# sequential per-term test is required. Relying on the package default is
# fragile because newer vegan releases are reported to default to a single
# omnibus "Model" row -- NOTE(review): confirm against the installed vegan
# version.

# Treatment as one categorical term.
bray_perm_uni_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + treatment,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                            permutations = 10000, by = "terms")

# Each treatment as its own term.
bray_perm_strata <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp,
                            data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE),
                            strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                            permutations = 10000, by = "terms")

# Sample type x treatment interaction.
bray_perm_inter <- vegan::adonis2(distance(phyloseq_rel_nz, method="bray") ~ sample_type * treatment + log10(Final_reads),
                                  data = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE),
                                  strata = phyloseq_rel_nz %>% sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                                  permutations = 10000, by = "terms")

# Stratified by sample type: nasal samples only.
bray_perm_ns <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Nasal"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                               data = subset_samples(phyloseq_rel_nz, sample_type == "Nasal") %>%
                                       sample_data %>% data.frame(check.names = FALSE),
                               strata = subset_samples(phyloseq_rel_nz, sample_type == "Nasal") %>%
                                       sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                               permutations = 10000, by = "terms")

# Stratified by sample type: BAL samples only.
bray_perm_bal  <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "BAL"), method="bray") ~  lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                 data = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>% sample_data %>% data.frame(check.names = FALSE),
                                 strata = subset_samples(phyloseq_rel_nz, sample_type == "BAL") %>%
                                         sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                                 permutations = 10000, by = "terms")

# Stratified by sample type: sputum samples only.
bray_perm_spt <- vegan::adonis2(distance(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"), method="bray") ~ lypma + benzonase + host_zero + molysis + qiaamp + log10(Final_reads),
                                data = subset_samples(phyloseq_rel_nz, sample_type == "Sputum") %>% sample_data %>% data.frame(check.names = FALSE),
                                strata = subset_samples(phyloseq_rel_nz, sample_type == "Sputum") %>%
                                        sample_data %>% data.frame(check.names = FALSE) %>% .$subject_id,
                                permutations = 10000, by = "terms")


# PCoA ordination of Bray-Curtis dissimilarities, coloured by treatment and
# shaped by sample type (figure panel "E").
# The legend label for the last treatment was misspelled "QIAaamp"; it is now
# "QIAamp", matching the result tables elsewhere in this report.
ordinate(phyloseq_rel_nz,  method = "PCoA", distance = "bray") %>%
        plot_ordination(phyloseq_rel_nz, ., col = "treatment", shape = "sample_type" ) +
        #scale_color_viridis(discrete = 6, name = "Treatment", labels = c("Untreated","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) +
        scale_color_manual(values = c("#e31a1c", "#fb9a99", "#33a02c", "#b2df8a", "#1f78b4", "#a6cee3"), name = "Treatment", labels = c("Untreated","lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        scale_shape(name = "Sample type", labels = c("BAL", "Nasal", "Sputum")) +
        geom_point(size = 3) +
        theme_classic (base_size = 12, base_family = "serif") +
        theme(plot.tag = element_text(size = 15), legend.spacing = unit(0, 'cm'), legend.key.height = unit(0.4, "cm")) + #legend.position = c(0.9, 0.4)
        labs(tag = "E")

Beta diversity boxplot (Function)

Distances between samples within each subject. Mean distance between control <-> treatment for each subject

# Distances of beta diversity - boxplots.
# Long-format table of pairwise Bray-Curtis distances (columns iso1, iso2, dist
# are used below).
bray_dist_long <- phyloseq_rel_nz %>%
        distance(method = "bray") %>%
        as.matrix() %>%
        melt_dist()

# Sample id and treatment are parsed from the sample names of each pair member.
# (This could also be done by merging metadata into `bray_dist_long`.)
# The sample id is built from the first two "_"-separated fields of the name.
names <- data.frame(str_split_fixed(bray_dist_long$iso1, "_", 3))
names2 <- data.frame(str_split_fixed(bray_dist_long$iso2, "_", 3))

# First member of the pair.
bray_dist_long$sample_id_1 <- paste(names$X1, names$X2, sep = "_")
bray_dist_long$method_1 <- with(bray_dist_long, case_when(
        grepl("control", iso1) ~ "control",
        grepl("lyPMA", iso1) ~ "lypma",
        grepl("benzonase", iso1) ~ "benzonase",
        grepl("host", iso1) ~ "host_zero",
        grepl("qia", iso1) ~ "qiaamp",
        grepl("moly", iso1) ~ "molysis",
        .default = NA_character_
))

# Second member of the pair.
bray_dist_long$sample_id_2 <- paste(names2$X1, names2$X2, sep = "_")
bray_dist_long$method_2 <- with(bray_dist_long, case_when(
        grepl("control", iso2) ~ "control",
        grepl("lyPMA", iso2) ~ "lypma",
        grepl("benzonase", iso2) ~ "benzonase",
        grepl("host", iso2) ~ "host_zero",
        grepl("qia", iso2) ~ "qiaamp",
        grepl("moly", iso2) ~ "molysis",
        .default = NA_character_
))

# Keep pairs that share a sample id ...
path_bray_dist_long_within_sampleid <- subset(
        bray_dist_long,
        sample_id_1 == sample_id_2
)
# ... and of those, only pairs in which one member is the control.
path_bray_dist_long_within_sampleid_from_control <- subset(
        path_bray_dist_long_within_sampleid,
        method_1 == "control" | method_2 == "control"
)

# The pair's treatment is the method of the non-control member.
path_bray_dist_long_within_sampleid_from_control$treatment <- with(
        path_bray_dist_long_within_sampleid_from_control,
        ifelse(method_1 == "control", method_2, method_1)
)

# Sample type is read from the first sample name of each pair.
path_bray_dist_long_within_sampleid_from_control$sample_type <- with(
        path_bray_dist_long_within_sampleid_from_control,
        case_when(
                grepl("NS", iso1) ~ "nasal_swab",
                grepl("CFB", iso1) ~ "Sputum",
                grepl("BAL", iso1) ~ "BAL",
                .default = NA_character_
        )
)

# Facet titles: map the sample_type codes used in the distance table to
# display names.
label <- c("BAL", "Nasal", "Sputum")
names(label) <- c("BAL", "nasal_swab", "Sputum")

# Boxplot of within-sample control-vs-treatment Bray-Curtis distances, one
# facet per sample type (figure panel "F").
#
# `treatment` is a character column, so ggplot orders its values
# alphabetically (benzonase, host_zero, lypma, molysis, qiaamp). Supplying
# colours and labels positionally in lyPMA/Benzonase/Host zero/Molysis/QIAamp
# order therefore attached the wrong label and colour to each box. The fix is
# twofold: (1) fill by a factor with explicit level order so boxes and legend
# follow the intended order, and (2) name the colour and label vectors by
# treatment code so each value is matched by name, not by position.
# The legend label "QIAaamp" is also corrected to "QIAamp".
ggplot(path_bray_dist_long_within_sampleid_from_control,
       aes(y = dist,
           fill = factor(treatment,
                         levels = c("lypma", "benzonase", "host_zero", "molysis", "qiaamp")))) +
        geom_boxplot() +
        #scale_fill_manual(values = c(viridis(6)[2:6])) +
        scale_fill_manual(values = c(lypma = "#fb9a99", benzonase = "#33a02c", host_zero = "#b2df8a", molysis = "#1f78b4", qiaamp = "#a6cee3"),
                          name = "Treatment",
                          labels = c(lypma = "lyPMA", benzonase = "Benzonase", host_zero = "Host zero", molysis = "Molysis", qiaamp = "QIAamp")) + #color using https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
        ylab("Sample pair distances") +
        theme_classic (base_size = 12, base_family = "serif") +
        labs(tag = "F") +
        theme(plot.tag = element_text(size = 15),  axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
        facet_wrap(~sample_type, labeller = labeller(sample_type = label))

Function PERMANOVA test results

Treatment as categorized group

dist ~ sample_type + log10(Final_reads) + treatment, strata = subject

No significant changes were observed.

# PERMANOVA table (treatment as one categorical term). Term names are mapped
# to display labels (unlisted terms become NA), values are rounded to three
# decimals, and rows with p < 0.05 are flagged with "*".
bray_perm_uni_strata %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "sample_type" = "Sample type",
                "treatment" = "Treatment",
                "log10(Final_reads)" = "log10(Final reads)",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html", caption = "Subject id as strata term") %>%
        kable_styling(full_width = 0, html_font = "serif")
Subject id as strata term
Df SumOfSqs R2 F Pr(>F)
Sample type 4 32.217 0.767 133.512 0.000
log10(Final reads) 1 1.138 0.027 18.870 0.000
Treatment 5 0.450 0.011 1.491 0.107
Residual 136 8.204 0.195 NA NA
Total 146 42.010 1.000 NA NA

Function PERMANOVA (detailed treatment)

dist ~ sample_type + log10(Final_reads) + lypma + benzonase + host_zero + molysis + qiaamp, strata = subject

No significant changes were observed.

# PERMANOVA table (each treatment as its own term). Term names are mapped to
# display labels (unlisted terms become NA), values are rounded to three
# decimals, and rows with p < 0.05 are flagged with "*".
bray_perm_strata %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "sample_type" = "Sample type",
                "lypma" = "lyPMA",
                "benzonase" = "Benzonase",
                "host_zero" = "Host zero",
                "molysis" = "Molysis",
                "qiaamp" = "QIAamp",
                "log10(Final_reads)" = "log10(Final reads)",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 4 32.217 0.767 133.512 0.000
log10(Final reads) 1 1.138 0.027 18.870 0.000
lyPMA 1 0.048 0.001 0.791 0.528
Benzonase 1 0.035 0.001 0.577 0.727
Host zero 1 0.032 0.001 0.534 0.796
Molysis 1 0.134 0.003 2.216 0.063
QIAamp 1 0.201 0.005 3.337 0.024
Residual 136 8.204 0.195 NA NA
Total 146 42.010 1.000 NA NA

QIAamp showed highest changes. But, it could be sample type specific.

Function interaction term

dist ~ sample_type * treatment + log10(Final_reads), strata = subject

Some changes were treatment induced?

We don’t have to run stratified analysis

# PERMANOVA table for the sample type x treatment interaction model. Term
# names are mapped to display labels (unlisted terms become NA), values are
# rounded to three decimals, and rows with p < 0.05 are flagged with "*".
bray_perm_inter %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "sample_type" = "Sample type",
                "treatment" = "Treatment",
                "subject_id" = "Subject",
                "log10(Final_reads)" = "log10(Final reads)",
                "sample_type:treatment" = "Sample type * treatment",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
Sample type 4 32.217 0.767 160.103 0
Treatment 5 0.686 0.016 2.729 0
log10(Final reads) 1 0.902 0.021 17.925 0
Sample type * treatment 20 2.369 0.056 2.354 0
Residual 116 5.836 0.139 NA NA
Total 146 42.010 1.000 NA NA

Stratified (NS)

Stratified analysis not matching with boxplot results

# PERMANOVA table for nasal samples. Term names are mapped to display labels
# (unlisted terms become NA), values are rounded to three decimals, and rows
# with p < 0.05 are flagged with "*".
bray_perm_ns %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "lypma" = "lyPMA",
                "benzonase" = "Benzonase",
                "host_zero" = "Host zero",
                "molysis" = "Molysis",
                "qiaamp" = "QIAamp",
                "subject_id" = "Subject id",
                "log10(Final_reads)" = "log10(Final reads)",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.011 0.042 2.193 0.056
Benzonase 1 0.011 0.040 2.076 0.171
Host zero 1 0.009 0.033 1.728 0.227
Molysis 1 0.014 0.052 2.746 0.100
QIAamp 1 0.054 0.197 10.324 0.000
log10(Final reads) 1 0.028 0.103 5.405 0.019
Residual 28 0.147 0.534 NA NA
Total 34 0.275 1.000 NA NA

Stratified (BAL)

Stratified analysis not matching with boxplot results

# PERMANOVA table for BAL samples. Term names are mapped to display labels
# (unlisted terms become NA), values are rounded to three decimals, and rows
# with p < 0.05 are flagged with "*".
bray_perm_bal %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "lypma" = "lyPMA",
                "benzonase" = "Benzonase",
                "host_zero" = "Host zero",
                "molysis" = "Molysis",
                "qiaamp" = "QIAamp",
                "subject_id" = "Subject id",
                "log10(Final_reads)" = "log10(Final reads)",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.040 0.020 0.800 0.346
Benzonase 1 0.115 0.058 2.279 0.091
Host zero 1 0.027 0.014 0.543 0.403
Molysis 1 0.180 0.091 3.569 0.047
QIAamp 1 0.001 0.000 0.018 0.942
log10(Final reads) 1 0.603 0.306 11.980 0.078
Residual 20 1.007 0.510 NA NA
Total 26 1.973 1.000 NA NA

Stratified (spt)

Stratified analysis not matching with boxplot results

# PERMANOVA table for sputum samples. Term names are mapped to display labels
# (unlisted terms become NA), values are rounded to three decimals, and rows
# with p < 0.05 are flagged with "*".
bray_perm_spt %>%
        data.frame(check.names = FALSE) %>%
        rownames_to_column("row.names") %>%
        mutate(row.names = unname(c(
                "lypma" = "lyPMA",
                "benzonase" = "Benzonase",
                "host_zero" = "Host zero",
                "molysis" = "Molysis",
                "qiaamp" = "QIAamp",
                "subject_id" = "Subject id",
                "log10(Final_reads)" = "log10(Final reads)",
                "Residual" = "Residual",
                "Total" = "Total"
        )[row.names])) %>%
        column_to_rownames("row.names") %>%
        round(3) %>%
        mutate(` ` = if_else(abs(`Pr(>F)`) < 0.05, "*", " ", missing = " ")) %>%
        kbl(format = "html") %>%
        kable_styling(full_width = 0, html_font = "serif")
Df SumOfSqs R2 F Pr(>F)
lyPMA 1 0.017 0.023 3.773 0.062
Benzonase 1 0.002 0.003 0.423 0.576
Host zero 1 0.064 0.089 14.506 0.004
Molysis 1 0.139 0.191 31.291 0.000
QIAamp 1 0.370 0.509 83.348 0.000
log10(Final reads) 1 0.032 0.045 7.303 0.010
Residual 23 0.102 0.141 NA NA
Total 29 0.726 1.000 NA NA

Results:

A9. DA analysis for taxa, by sample type and treatment

Both stratified and nonstratified were conducted.

MaAsLin condition:

Transformation: log transform

Normalization: None - as functional hits were normalized as RPKM already.

https://forum.biobakery.org/t/maaslin-with-shortbred-results-and-panphlan/3102

Results

#DA analysis - MaAslin
# Store log10 read depth as its own sample-metadata column.
sample_data(phyloseq_rel_nz)$log10.Final_reads <-
        log10(sample_data(phyloseq_rel_nz)$Final_reads)

# DA analysis - MaAsLin, run for all samples without decontam.
# For features differentially abundant by host depletion method, look to see
# which ones overlap with potential contaminants.
# Model: y ~ log(final reads) + sample_type + treatment

# Result tables are read from CSV files under data/.
# All samples, without and with the interaction term.
f_maaslin_all <- read.csv(file.path("data", "f_maaslin_all.csv"))
f_maaslin_interaction <- read.csv(file.path("data", "f_maaslin_interaction.csv"))

# Per-group fit tables (bal / spt / ns / pos / neg).
f_fit_data_bal <- read.csv(file.path("data", "f_fit_data_bal.csv"))
f_fit_data_spt <- read.csv(file.path("data", "f_fit_data_spt.csv"))
f_fit_data_ns <- read.csv(file.path("data", "f_fit_data_ns.csv"))
f_fit_data_pos <- read.csv(file.path("data", "f_fit_data_pos.csv"))
f_fit_data_neg <- read.csv(file.path("data", "f_fit_data_neg.csv"))

# "_neg" variants of the per-sample-type fit tables.
f_fit_data_bal_neg <- read.csv(file.path("data", "f_fit_data_bal_neg.csv"))
f_fit_data_spt_neg <- read.csv(file.path("data", "f_fit_data_spt_neg.csv"))
f_fit_data_ns_neg <- read.csv(file.path("data", "f_fit_data_ns_neg.csv"))

MaAslin without interaction - volcano plot

Again, most of DA functions were sample type specific

# Build the significance tables used by the balloon plots (top 20 features).
#
# Args:
#   data: data frame of MaAsLin results with at least the columns `feature`,
#     `metadata` and `qval`. `metadata` must contain the values "lypma",
#     "benzonase", "host_zero", "molysis" and "qiaamp".
#
# Returns a list with three elements:
#   data        - `feature` plus one q-value column per treatment (lower-case
#                 names) for the (at most) 20 features with the smallest
#                 q-value.
#   data_italic - the same q-values with features as row names and display
#                 column names (lyPMA, Benzonase, `Host zero`, Molysis, QIAamp).
#   data_sig    - data frame shaped like `data_italic` holding "*" where the
#                 q-value is below 0.1 and NA elsewhere.
make_sig_table <- function(data) {
  # Long -> wide: one row per feature, one q-value column per metadata term.
  sig_data <- spread(
    data[order(data$qval), c("feature", "metadata", "qval")],
    metadata, qval
  )

  # Rewrite feature names: every "." becomes "-", except the one bracketed
  # species name, which is restored explicitly. The special case must be
  # tested on the raw name; after the dot replacement it could never match.
  raw_feature <- sig_data$feature
  sig_data$feature <- ifelse(
    raw_feature == "X.Collinsella._massiliensis",
    "[Collinsella]_massiliensis",
    gsub("[.]", "-", raw_feature)
  )

  # Smallest q-value per feature, computed on the numeric q-value columns
  # only. Including the character `feature` column (as apply() over the whole
  # data frame does) turns this into a string comparison. pmin() with
  # na.rm = TRUE returns NA only when every q-value of the row is NA.
  qval_cols <- setdiff(names(sig_data), "feature")
  sig_data$min <- do.call(
    pmin,
    c(unname(as.list(sig_data[qval_cols])), list(na.rm = TRUE))
  )

  # Keep the treatment columns of the 20 best features. head() returns fewer
  # rows when fewer exist instead of padding with NA rows.
  sig_data <- sig_data[order(sig_data$min), ] %>%
    select("feature", "lypma", "benzonase", "host_zero", "molysis", "qiaamp") %>%
    head(20)

  # Features become row names. The permuted row names left over from the
  # ordering step have to be dropped first, otherwise column_to_rownames()
  # refuses to overwrite them.
  sig_data_italic <- sig_data %>%
    remove_rownames() %>%
    column_to_rownames(var = "feature") %>%
    rename(
      lyPMA = lypma,
      Benzonase = benzonase,
      `Host zero` = host_zero,
      Molysis = molysis,
      QIAamp = qiaamp
    )

  # Asterisk where q < 0.1; check.names = FALSE keeps the "Host zero" column
  # name intact.
  sig_data_sig <- ifelse(sig_data_italic < 0.1, "*", NA) %>%
    data.frame(check.names = FALSE)

  list(data = sig_data, data_italic = sig_data_italic, data_sig = sig_data_sig)
}

# Replace each raw MaAsLin result table by its list of significance tables
# (data / data_italic / data_sig). The input objects are overwritten, so the
# CSV files have to be re-read before this chunk can be run a second time.

# Controls
f_fit_data_neg <- f_fit_data_neg %>% make_sig_table()
f_fit_data_pos <- f_fit_data_pos %>% make_sig_table()

# Sample types
f_fit_data_bal <- f_fit_data_bal %>% make_sig_table()
f_fit_data_ns <- f_fit_data_ns %>% make_sig_table()
f_fit_data_spt <- f_fit_data_spt %>% make_sig_table()

# "_neg" variants
f_fit_data_bal_neg <- f_fit_data_bal_neg %>% make_sig_table()
f_fit_data_ns_neg <- f_fit_data_ns_neg %>% make_sig_table()
f_fit_data_spt_neg <- f_fit_data_spt_neg %>% make_sig_table()



# Mean relative abundance of the selected features per treatment group.
#
# Args:
#   physeq_sub: phyloseq object already restricted to one sample type and to
#     the features of interest.
#   features: character vector of feature names; selects and orders the rows
#     of the result (pass the row names of the matching `data_italic` table).
#
# Returns a data frame with a `feature` column followed by one column per
# treatment value; zeros are replaced by NA.
mean_rel_by_treatment <- function(physeq_sub, features) {
  cbind(physeq_sub %>% otu_table %>% t, physeq_sub %>% sample_data) %>%
    group_by(treatment) %>%
    summarise_if(is.numeric, mean, na.rm = TRUE) %>%
    # Column 1 is `treatment`; columns 2-21 are kept as the feature columns.
    # NOTE(review): assumes exactly 20 selected features come before the
    # numeric sample variables - TODO confirm.
    .[, 1:21] %>%
    column_to_rownames(., "treatment") %>%
    t() %>%
    data.frame(check.names = FALSE) %>%
    .[features, ] %>%
    mutate_all(~na_if(., 0)) %>%
    rownames_to_column("feature")
}

# The subset_samples()/subset_taxa() calls stay at top level, exactly as
# before; only the summarising pipeline is shared through the helper.

# Negative controls
f_neg_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Neg."),
                         taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Neg.")) %in% f_fit_data_neg$data$feature)
f_fit_data_neg$rel <- mean_rel_by_treatment(f_neg_sig, row.names(f_fit_data_neg$data_italic))

# Mock communities (positive controls)
f_pos_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Mock"),
                         taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Mock")) %in% f_fit_data_pos$data$feature)
f_fit_data_pos$rel <- mean_rel_by_treatment(f_pos_sig, row.names(f_fit_data_pos$data_italic))

# Sputum
# (A duplicated copy of this step used to assign to `fit_data_spt$rel`,
# without the `f_` prefix, and therefore touched a different object. It has
# been removed.)
f_spt_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"),
                         taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Sputum")) %in% f_fit_data_spt$data$feature)
f_fit_data_spt$rel <- mean_rel_by_treatment(f_spt_sig, row.names(f_fit_data_spt$data_italic))

f_spt_neg_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Sputum"),
                             taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Sputum")) %in% f_fit_data_spt_neg$data$feature)
f_fit_data_spt_neg$rel <- mean_rel_by_treatment(f_spt_neg_sig, row.names(f_fit_data_spt_neg$data_italic))

# Nasal
f_ns_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Nasal"),
                        taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Nasal")) %in% f_fit_data_ns$data$feature)
f_fit_data_ns$rel <- mean_rel_by_treatment(f_ns_sig, row.names(f_fit_data_ns$data_italic))
# Overwrite the feature column with the row names of the significance table.
f_fit_data_ns$rel$feature <- row.names(f_fit_data_ns$data_sig)

# The nasal "_neg" table is built from the nasal "_neg" feature list; the
# sputum list (f_fit_data_spt_neg) was used here by mistake.
f_ns_neg_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "Nasal"),
                            taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "Nasal")) %in% f_fit_data_ns_neg$data$feature)
f_fit_data_ns_neg$rel <- mean_rel_by_treatment(f_ns_neg_sig, row.names(f_fit_data_ns_neg$data_italic))

# BAL
f_bal_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "BAL"),
                         taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "BAL")) %in% f_fit_data_bal$data$feature)
f_fit_data_bal$rel <- mean_rel_by_treatment(f_bal_sig, row.names(f_fit_data_bal$data_italic))

f_bal_neg_sig <- subset_taxa(subset_samples(phyloseq_rel_nz, sample_type == "BAL"),
                             taxa_names(subset_samples(phyloseq_rel_nz, sample_type == "BAL")) %in% f_fit_data_bal_neg$data$feature)
f_fit_data_bal_neg$rel <- mean_rel_by_treatment(f_bal_neg_sig, row.names(f_fit_data_bal_neg$data_italic))




# Volcano plot of the all-sample MaAsLin results: coefficient on x,
# -log10(q-value) on y, one colour per model term. The grey guides mark
# -log10(q) = 1 (i.e. q = 0.1) and coefficient = 0.
ggplot(f_maaslin_all, aes(x = coef, y = -log10(qval), col = metadata)) +
  theme_classic(base_family = "serif") +
  geom_point(size = 2) +
  geom_hline(yintercept = 1, col = "gray") +
  geom_vline(xintercept = 0, col = "gray") +
  # Disabled annotation, kept for reference:
  #geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
  labs(
    tag = "A",
    x = "MaAslin coefficient",
    y = "-log<sub>10</sub>(*q*-value)"
  ) +
  # element_markdown() renders the HTML/markdown in the y-axis title.
  theme(
    legend.position = "top",
    axis.title.y = ggtext::element_markdown()
  ) +
  # Palette taken from https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=6
  scale_color_manual(
    values = c("#4daf4a", "#984ea3", "#f781bf", "#377eb8", "#ff7f00", "#ffff33", "#a65628"),
    breaks = c("log10.Final_reads", "sample_type", "lypma", "benzonase", "host_zero", "molysis", "qiaamp"),
    labels = c("log10 (Final reads)", "Sample type", "lyPMA", "Benzonase", "Host zero", "Molysis", "QIAamp")
  ) +
  guides(
    col = guide_legend(title = "Covariates", title.position = "top", nrow = 2)
  )

Most of the DA functions were sample-type dependent.

MaAsLin table (function)

A large number of functions were differentially abundant.

# Number of significant associations (q < 0.1) per model term.
f_maaslin_all %>%
  subset(qval < 0.1) %>%
  .[["metadata"]] %>%
  table()
## .
##         benzonase         host_zero log10.Final_reads             lypma 
##                90               100               368                83 
##           molysis            qiaamp       sample_type 
##                16                36               591

Stratified analysis is required.

Balloon plot - Sputum

Similarly, few functions were newly discovered

# Balloon plot for the sputum tables: balloon size = mean relative abundance
# per treatment, fill = MaAsLin q-value, red asterisk = q-value < 0.1.
# The three wide tables (rel, data_italic, data_sig) are reshaped to long
# form and joined on feature + treatment before plotting.
merge(
  f_fit_data_spt$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  f_fit_data_spt$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_spt$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by = c("feature", "treatment"),
    all = TRUE
  ) %>%
  ggballoonplot(x = "treatment", y = "feature", size = "value", fill = "qval") +
  theme_classic(base_family = "serif") +
  # Fill gradient for the q-values
  gradient_fill(c("#006d2c", "#edf8fb")) +
  labs(x = "Experimental group", y = "Species", tag = "D") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
    # Markdown element so that names on the y axis can be italicised
    axis.text.y = ggtext::element_markdown()
  ) +
  # Significance asterisks; the constant colour mapping creates the legend key
  geom_text(
    aes(x = treatment, y = feature, label = sig, col = "red"),
    hjust = -2,
    vjust = 0.8,
    size = 5
  ) +
  guides(
    col = guide_legend(
      nrow = 1,
      override.aes = aes(label = "*", size = 10, color = "red"),
      title = "Significance",
      title.position = "top",
      order = 3
    ),
    fill = guide_colorbar(
      title = c(expression(paste(italic("q"), "-value", sep = ""))),
      title.position = "top",
      order = 2
    ),
    size = guide_legend(
      title = "Relative abundance",
      title.position = "top",
      order = 1,
      nrow = 2
    )
  ) +
  scale_x_discrete(
    labels = c(
      "control" = "Untreated",
      "lypma" = "lyPMA",
      "benzonase" = "Benzonase",
      "host_zero" = "Host-zero",
      "molysis" = "Molysis",
      "qiaamp" = "QIAamp"
    )
  ) +
  scale_color_manual(
    values = c("red"),
    labels = c(expression(paste(italic("q"), "-value < 0.1", sep = "")))
  )

Balloon plot - BAL

Similarly, few functions were newly discovered

# Balloon plot for the BAL tables: balloon size = mean relative abundance per
# treatment, fill = MaAsLin q-value, red asterisk = q-value < 0.1.
# The three wide tables (rel, data_italic, data_sig) are reshaped to long
# form and joined on feature + treatment before plotting.
merge(
  f_fit_data_bal$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  f_fit_data_bal$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_bal$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by = c("feature", "treatment"),
    all = TRUE
  ) %>%
  ggballoonplot(x = "treatment", y = "feature", size = "value", fill = "qval") +
  theme_classic(base_family = "serif") +
  # Fill gradient for the q-values
  gradient_fill(c("#006d2c", "#edf8fb")) +
  labs(x = "Experimental group", y = "Species", tag = "D") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
    # Markdown element so that names on the y axis can be italicised
    axis.text.y = ggtext::element_markdown()
  ) +
  # Significance asterisks; the constant colour mapping creates the legend key
  geom_text(
    aes(x = treatment, y = feature, label = sig, col = "red"),
    hjust = -2,
    vjust = 0.8,
    size = 5
  ) +
  guides(
    col = guide_legend(
      nrow = 1,
      override.aes = aes(label = "*", size = 10, color = "red"),
      title = "Significance",
      title.position = "top",
      order = 3
    ),
    fill = guide_colorbar(
      title = c(expression(paste(italic("q"), "-value", sep = ""))),
      title.position = "top",
      order = 2
    ),
    size = guide_legend(
      title = "Relative abundance",
      title.position = "top",
      order = 1,
      nrow = 2
    )
  ) +
  scale_x_discrete(
    labels = c(
      "control" = "Untreated",
      "lypma" = "lyPMA",
      "benzonase" = "Benzonase",
      "host_zero" = "Host-zero",
      "molysis" = "Molysis",
      "qiaamp" = "QIAamp"
    )
  ) +
  scale_color_manual(
    values = c("red"),
    labels = c(expression(paste(italic("q"), "-value < 0.1", sep = "")))
  )

Balloon plot - Nasal

Some functions were newly discovered.

# Balloon plot for the nasal tables: balloon size = mean relative abundance
# per treatment, fill = MaAsLin q-value, red asterisk = q-value < 0.1.
# The three wide tables (rel, data_italic, data_sig) are reshaped to long
# form and joined on feature + treatment before plotting.
merge(
  f_fit_data_ns$rel %>%
    gather(treatment, value, Untreated:QIAamp, factor_key = TRUE),
  f_fit_data_ns$data_italic %>%
    rownames_to_column("feature") %>%
    gather(treatment, qval, lyPMA:QIAamp, factor_key = TRUE),
  by = c("feature", "treatment"),
  all = TRUE
) %>%
  merge(
    f_fit_data_ns$data_sig %>%
      rownames_to_column("feature") %>%
      gather(treatment, sig, lyPMA:QIAamp, factor_key = TRUE),
    by = c("feature", "treatment"),
    all = TRUE
  ) %>%
  ggballoonplot(x = "treatment", y = "feature", size = "value", fill = "qval") +
  theme_classic(base_family = "serif") +
  # Fill gradient for the q-values
  gradient_fill(c("#006d2c", "#edf8fb")) +
  labs(x = "Experimental group", y = "Species", tag = "D") +
  theme(
    panel.grid.major = element_line(colour = "grey"),
    legend.position = "top",
    axis.text.x = element_text(vjust = 0.5, hjust = 0.5),
    # Markdown element so that names on the y axis can be italicised
    axis.text.y = ggtext::element_markdown()
  ) +
  # Significance asterisks; the constant colour mapping creates the legend key
  geom_text(
    aes(x = treatment, y = feature, label = sig, col = "red"),
    hjust = -2,
    vjust = 0.8,
    size = 5
  ) +
  guides(
    col = guide_legend(
      nrow = 1,
      override.aes = aes(label = "*", size = 10, color = "red"),
      title = "Significance",
      title.position = "top",
      order = 3
    ),
    fill = guide_colorbar(
      title = c(expression(paste(italic("q"), "-value", sep = ""))),
      title.position = "top",
      order = 2
    ),
    size = guide_legend(
      title = "Relative abundance",
      title.position = "top",
      order = 1,
      nrow = 2
    )
  ) +
  scale_x_discrete(
    labels = c(
      "control" = "Untreated",
      "lypma" = "lyPMA",
      "benzonase" = "Benzonase",
      "host_zero" = "Host-zero",
      "molysis" = "Molysis",
      "qiaamp" = "QIAamp"
    )
  ) +
  scale_color_manual(
    values = c("red"),
    labels = c(expression(paste(italic("q"), "-value < 0.1", sep = "")))
  )

Results: After adding control data, the MaAsLin analysis needs to be rerun. Adding controls (mock communities) for each treatment group will give more statistically valid results for the model y ~ log(final reads) + sample_type + treatment (random effect = subject_id).

MaAslin with interaction

# Volcano plot of the MaAsLin model with the interaction term: coefficient on
# x, -log10(q-value) on y, one colour per fixed effect. The grey guides mark
# -log10(q) = 1 (i.e. q = 0.1) and coefficient = 0.
ggplot(f_maaslin_interaction, aes(x = coef, y = -log10(qval), col = metadata)) +
  theme_classic(base_family = "serif") +
  geom_point(size = 2) +
  geom_hline(yintercept = 1, col = "gray") +
  geom_vline(xintercept = 0, col = "gray") +
  # Disabled panel tag and annotation, kept for reference:
  #labs(tag = "A") +
  #geom_richtext(aes( 4, 8, label = "*q*-value = 0.1, fold-change = 0", vjust = -1, fontface = 1), col = "grey", size = 3, family = "serif") +
  labs(
    title = "MaAslin with interaction term",
    x = "MaAslin coefficient",
    y = "-log<sub>10</sub>(*q*-value)"
  ) +
  # element_markdown() renders the HTML/markdown in the y-axis title.
  theme(
    legend.position = "top",
    axis.title.y = ggtext::element_markdown()
  ) +
  scale_color_manual(values = c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3")) +
  guides(
    col = guide_legend(title = "Fixed effects", title.position = "top", nrow = 1)
  )

# Number of significant associations (q < 0.1) per term of the model with the
# interaction term. This must tabulate `f_maaslin_interaction`, the object
# used by the plot and tables of this section; `maaslin_interaction` (without
# the `f_` prefix) is a different object.
# NOTE(review): re-knit to refresh the counts printed below this chunk; the
# printed numbers were produced from the other object.
cat("Number of differentially abundant bugs by each metadata")
## Number of differentially abundant bugs by each metadata
f_maaslin_interaction %>% subset(., .$qval < 0.1) %>% .$metadata %>% table()
## .
##    log10.Final_reads          sample_type sampletype_treatment 
##                   38                  146                  560 
##            treatment 
##                  135

MaAsLin interaction analysis

Some taxa were increased by each treatment. However, they are not contaminants if they are present in most of the treatments.

 # For every feature, count how many `treatment` rows of the interaction model
 # are significant (q < 0.1), sort by that count (descending) and render the
 # result as an HTML table.
 f_maaslin_interaction %>%
   subset(qval < 0.1 & metadata == "treatment") %>%
   .[["feature"]] %>%
   table() %>%
   data.frame() %>%
   arrange(-Freq) %>%
   rename(Feature = ".") %>%
   kbl(
     format = "html",
     caption = "Table of taxa differentially abundant by treatment"
   ) %>%
   kable_styling(full_width = 0, html_font = "serif")
Table of taxa differentially abundant by treatment
Feature Freq
PWY.5676 4
COA.PWY 3
PWY.4981 3
PWY.7221 3
PEPTIDOGLYCANSYN.PWY 2
PWY.2942 2
PWY.3781 2
PWY.3841 2
PWY.4242 2
PWY.5097 2
PWY.5188 2
PWY.5189 2
PWY.5659 2
PWY.5675 2
PWY.5686 2
PWY.5913 2
PWY.6163 2
PWY.6385 2
PWY.6386 2
PWY.7184 2
PWY.7199 2
PWY.7219 2
PWY.7237 2
PWY.724 2
PWY0.1586 2
PWY0.166 2
PWY66.389 2
THRESYN.PWY 2
UDPNAGSYN.PWY 2
X1CMET2.PWY 2
ALLANTOINDEG.PWY 1
ANAEROFRUCAT.PWY 1
ANAGLYCOLYSIS.PWY 1
ARG.POLYAMINE.SYN 1
ARGDEG.PWY 1
ARGININE.SYN4.PWY 1
ARGORNPROST.PWY 1
ARO.PWY 1
ASPASN.PWY 1
AST.PWY 1
BIOTIN.BIOSYNTHESIS.PWY 1
BRANCHED.CHAIN.AA.SYN.PWY 1
CALVIN.PWY 1
CITRULBIO.PWY 1
COA.PWY.1 1
COBALSYN.PWY 1
CODH.PWY 1
COLANSYN.PWY 1
COMPLETE.ARO.PWY 1
DENITRIFICATION.PWY 1
DTDPRHAMSYN.PWY 1
FAO.PWY 1
FASYN.INITIAL.PWY 1
FERMENTATION.PWY 1
FOLSYN.PWY 1
FUC.RHAMCAT.PWY 1
GALACT.GLUCUROCAT.PWY 1
GALACTARDEG.PWY 1
GALACTUROCAT.PWY 1
GLUCARDEG.PWY 1
GLUCARGALACTSUPER.PWY 1
GLUCONEO.PWY 1
GLUCOSE1PMETAB.PWY 1
GLUCUROCAT.PWY 1
GLYCOCAT.PWY 1
GLYCOGENSYNTH.PWY 1
GLYCOLYSIS 1
GLYCOLYSIS.E.D 1
GLYCOLYSIS.TCA.GLYOX.BYPASS 1
GLYOXYLATE.BYPASS 1
GOLPDLCAT.PWY 1
HEXITOLDEGSUPER.PWY 1
ILEUSYN.PWY 1
KETOGLUCONMET.PWY 1
LACTOSECAT.PWY 1
LIPASYN.PWY 1
MANNOSYL.CHITO.DOLICHOL.BIOSYNTHESIS 1
METHGLYUT.PWY 1
NADSYN.PWY 1
NONOXIPENT.PWY 1
OANTIGEN.PWY 1
ORNARGDEG.PWY 1
ORNDEG.PWY 1
P105.PWY 1
P122.PWY 1
P124.PWY 1
P125.PWY 1
P164.PWY 1
P165.PWY 1
P185.PWY 1
P221.PWY 1
P4.PWY 1
P42.PWY 1
P441.PWY 1
P461.PWY 1
P562.PWY 1
PANTOSYN.PWY 1
PENTOSE.P.PWY 1
PHOSLIPSYN.PWY 1
POLYAMINSYN3.PWY 1
POLYAMSYN.PWY 1
POLYISOPRENSYN.PWY 1
PPGPPMET.PWY 1
PRPP.PWY 1
PWY.1861 1
PWY.2201 1
PWY.241 1
PWY.2723 1
PWY.2941 1
PWY.3001 1
PWY.3502 1
PWY.3801 1
PWY.4041 1
PWY.4361 1
PWY.4702 1
PWY.4984 1
PWY.5005 1
PWY.5022 1
PWY.5028 1
PWY.5044 1
PWY.5067 1
PWY.5079 1
PWY.5081 1
PWY.5083 1
PWY.5100 1
PWY.5103 1
PWY.5104 1
PWY.5129 1
PWY.5136 1
PWY.5138 1
PWY.5154 1
PWY.5177 1
PWY.5265 1
PWY.5304 1
PWY.5306 1
PWY.5345 1
PWY.5347 1
PWY.5367 1
PWY.5381 1
PWY.5384 1
PWY.5464 1
PWY.5505 1
PWY.5514 1
PWY.561 1
PWY.5651 1
PWY.5656 1
PWY.5667 1
PWY.5690 1
PWY.5692 1
PWY.5705 1
PWY.5723 1
PWY.5747 1
PWY.5791 1
PWY.5837 1
PWY.5838 1
PWY.5840 1
PWY.5850 1
PWY.5855 1
PWY.5856 1
PWY.5857 1
PWY.5860 1
PWY.5861 1
PWY.5863 1
PWY.5870 1
PWY.5871 1
PWY.5872 1
PWY.5873 1
PWY.5897 1
PWY.5898 1
PWY.5899 1
PWY.5910 1
PWY.5918 1
PWY.5920 1
PWY.5941 1
PWY.5971 1
PWY.5973 1
PWY.5989 1
PWY.6075 1
PWY.6113 1
PWY.6125 1
PWY.6147 1
PWY.6168 1
PWY.621 1
PWY.6263 1
PWY.6284 1
PWY.6285 1
PWY.6309 1
PWY.6317 1
PWY.6318 1
PWY.6351 1
PWY.6352 1
PWY.6353 1
PWY.6396 1
PWY.6470 1
PWY.6471 1
PWY.6507 1
PWY.6519 1
PWY.6527 1
PWY.6531 1
PWY.6549 1
PWY.6588 1
PWY.6595 1
PWY.6596 1
PWY.6606 1
PWY.6608 1
PWY.6609 1
PWY.6612 1
PWY.6628 1
PWY.6630 1
PWY.6703 1
PWY.6708 1
PWY.6737 1
PWY.6797 1
PWY.6829 1
PWY.6859 1
PWY.6891 1
PWY.6892 1
PWY.6895 1
PWY.6901 1
PWY.6969 1
PWY.6981 1
PWY.6992 1
PWY.7003 1
PWY.7007 1
PWY.7036 1
PWY.7039 1
PWY.7053 1
PWY.7115 1
PWY.7117 1
PWY.7118 1
PWY.7196 1
PWY.7197 1
PWY.7198 1
PWY.7200 1
PWY.7204 1
PWY.7208 1
PWY.7210 1
PWY.7211 1
PWY.7228 1
PWY.7242 1
PWY.7245 1
PWY.7254 1
PWY.7268 1
PWY.7269 1
PWY.7279 1
PWY.7282 1
PWY.7283 1
PWY.7286 1
PWY.7288 1
PWY.7323 1
PWY.7328 1
PWY.7337 1
PWY.7338 1
PWY.7345 1
PWY.7385 1
PWY.7388 1
PWY.7391 1
PWY.7409 1
PWY.7411 1
PWY.7527 1
PWY.7528 1
PWY.7539 1
PWY.7546 1
PWY.7592 1
PWY.7606 1
PWY.7626 1
PWY.821 1
PWY.922 1
PWY0.1061 1
PWY0.1261 1
PWY0.1297 1
PWY0.1298 1
PWY0.1319 1
PWY0.1479 1
PWY0.162 1
PWY0.42 1
PWY0.781 1
PWY0.845 1
PWY0.881 1
PWY3O.19 1
PWY3O.355 1
PWY4FS.7 1
PWY4FS.8 1
PWY4LZ.257 1
PWY66.201 1
PWY66.367 1
PWY66.388 1
PWY66.391 1
PWY66.398 1
PWY66.400 1
PWY66.409 1
PWY66.422 1
PWYG.321 1
PYRIDNUCSAL.PWY 1
PYRIDNUCSYN.PWY 1
PYRIDOXSYN.PWY 1
REDCITCYC 1
RHAMCAT.PWY 1
RUMP.PWY 1
SALVADEHYPOX.PWY 1
SO4ASSIM.PWY 1
SPHINGOLIPID.SYN.PWY 1
SULFATE.CYS.PWY 1
TCA 1
TCA.GLYOX.BYPASS 1
TEICHOICACID.PWY 1
THISYN.PWY 1
THISYNARA.PWY 1
TRIGLSYN.PWY 1
TRNA.CHARGING.PWY 1
TRPSYN.PWY 1
UBISYN.PWY 1
URDEGR.PWY 1
URSIN.PWY 1
VALDEG.PWY 1
VALSYN.PWY 1
 # Short notes printed into the report
 cat("Most of taxa were found on most of treatments.")
## Most of taxa were found on most of treatments.
 cat("Some taxa were treatment specific, only to one group")
## Some taxa were treatment specific, only to one group
# Table of the features that have exactly one significant (q < 0.1)
# `treatment` row in the interaction model, with the treatment level (`value`),
# coefficient and q-value of that row.
subset(
  f_maaslin_interaction,
  f_maaslin_interaction$feature %in% (
    # Features whose significant-treatment count equals 1
    f_maaslin_interaction %>%
      subset(qval < 0.1 & metadata == "treatment") %>%
      .[["feature"]] %>%
      table() %>%
      data.frame() %>%
      subset(Freq == 1) %>%
      .[["."]] %>%
      as.character()
  )
) %>%
  subset(qval < 0.1 & metadata == "treatment") %>%
  select(c("feature", "metadata", "value", "coef", "qval")) %>%
  remove_rownames() %>%
  kbl(
    format = "html",
    caption = "Table of taxa specific to one treatment group"
  ) %>%
  kable_styling(full_width = 0, html_font = "serif")
Table of taxa specific to one treatment group
feature metadata value coef qval
PWY.6606 treatment lyPMA 18.389501 0.0000814
PWY.6595 treatment lyPMA 17.782892 0.0000987
GLUCONEO.PWY treatment lyPMA 17.256004 0.0001390
PWY4FS.7 treatment lyPMA 17.063858 0.0001953
PWY4FS.8 treatment lyPMA 17.064245 0.0001953
PPGPPMET.PWY treatment lyPMA 19.165180 0.0002422
GLYCOLYSIS.TCA.GLYOX.BYPASS treatment lyPMA 16.371591 0.0002469
HEXITOLDEGSUPER.PWY treatment lyPMA 16.055861 0.0002803
PWY.5345 treatment lyPMA 15.277318 0.0003947
ASPASN.PWY treatment lyPMA 16.170863 0.0004366
PWY.6168 treatment lyPMA 14.122654 0.0004461
PWY.6396 treatment lyPMA 13.654288 0.0004865
PWY.5920 treatment lyPMA 15.911001 0.0004913
PWY.5910 treatment lyPMA 14.332183 0.0004925
PWY.7385 treatment lyPMA 15.377686 0.0006016
PWY.5840 treatment lyPMA 15.167528 0.0006230
PWY.5791 treatment lyPMA 15.600813 0.0006884
PWY.5837 treatment lyPMA 15.600813 0.0006884
PWY.5897 treatment lyPMA 15.167533 0.0006950
PWY.5898 treatment lyPMA 15.167533 0.0006950
PWY.5899 treatment lyPMA 15.167533 0.0006950
PWY.6628 treatment lyPMA 16.003259 0.0006950
PWY.922 treatment lyPMA 13.502315 0.0006969
FOLSYN.PWY treatment lyPMA 13.388785 0.0008161
PWY.5028 treatment QIAamp 15.169594 0.0008411
PWY0.1298 treatment lyPMA 16.106334 0.0008777
PWY.6612 treatment lyPMA 13.241923 0.0008917
TCA.GLYOX.BYPASS treatment lyPMA 16.597331 0.0009351
PWY.7198 treatment lyPMA 13.102111 0.0009455
ANAEROFRUCAT.PWY treatment lyPMA 14.827707 0.0009586
P124.PWY treatment lyPMA 14.911091 0.0009864
PWY.5705 treatment lyPMA 13.838064 0.0010004
PWY.5850 treatment lyPMA 13.249212 0.0010205
PWY.7115 treatment lyPMA 14.084357 0.0010205
GOLPDLCAT.PWY treatment lyPMA 15.043431 0.0010247
PWY.561 treatment lyPMA 15.066937 0.0010247
PWY.5860 treatment lyPMA 13.255502 0.0010367
PWY.5505 treatment lyPMA 13.540425 0.0010465
RHAMCAT.PWY treatment lyPMA 14.818486 0.0010661
PWY.5863 treatment lyPMA 14.426545 0.0011214
PWY0.1297 treatment lyPMA 15.277661 0.0011302
TRNA.CHARGING.PWY treatment lyPMA 13.222975 0.0011588
PWY.7210 treatment lyPMA 15.982394 0.0011791
P42.PWY treatment lyPMA 14.902692 0.0011823
FAO.PWY treatment lyPMA 16.967273 0.0012079
PWY.5136 treatment lyPMA 16.772412 0.0012519
PWY.6507 treatment lyPMA 13.449180 0.0012526
PWY.6630 treatment lyPMA 13.835166 0.0012555
PWY4LZ.257 treatment lyPMA 13.509248 0.0013358
ALLANTOINDEG.PWY treatment lyPMA 13.283245 0.0013481
P105.PWY treatment lyPMA 16.191968 0.0013801
PWY.7003 treatment lyPMA 12.875324 0.0013815
POLYISOPRENSYN.PWY treatment lyPMA 14.247257 0.0014108
METHGLYUT.PWY treatment lyPMA 13.714242 0.0014258
P441.PWY treatment lyPMA 13.354373 0.0014724
PWY.6549 treatment lyPMA 13.990292 0.0015392
REDCITCYC treatment lyPMA 14.452201 0.0015392
P122.PWY treatment lyPMA 13.259414 0.0015512
PWY.7269 treatment lyPMA 15.326825 0.0016898
SULFATE.CYS.PWY treatment lyPMA 14.118694 0.0017211
PWY.7211 treatment lyPMA 14.093166 0.0018108
PRPP.PWY treatment lyPMA 12.820969 0.0019102
P185.PWY treatment lyPMA 16.547635 0.0019301
PWY.7391 treatment lyPMA 13.353611 0.0019535
PWY.5104 treatment lyPMA 17.460045 0.0019643
P165.PWY treatment lyPMA 13.724387 0.0020243
PWY.6471 treatment lyPMA 16.564939 0.0020464
PWY.5855 treatment lyPMA 18.708666 0.0020593
PWY.5856 treatment lyPMA 18.708666 0.0020593
PWY.5857 treatment lyPMA 18.708666 0.0020593
PWY.6708 treatment lyPMA 18.708666 0.0020593
PWY.7242 treatment lyPMA 12.851781 0.0023932
PWY.5100 treatment lyPMA 15.788423 0.0023988
PWY.5265 treatment lyPMA 16.109718 0.0024169
PWY.6901 treatment lyPMA 13.828397 0.0025764
PWY.5971 treatment lyPMA 12.739935 0.0026263
PWY.6075 treatment lyPMA 1.885657 0.0026394
PWY.6737 treatment lyPMA 13.331319 0.0026728
PWY0.881 treatment lyPMA 13.003127 0.0027499
PWY0.1479 treatment lyPMA 17.776618 0.0028479
PANTOSYN.PWY treatment lyPMA 14.327664 0.0028724
PWY.6284 treatment lyPMA 12.557201 0.0030359
PENTOSE.P.PWY treatment lyPMA 13.029626 0.0030721
FASYN.INITIAL.PWY treatment lyPMA 16.247650 0.0031298
GALACTUROCAT.PWY treatment lyPMA 12.599580 0.0032876
PWY.5690 treatment lyPMA 18.029333 0.0032928
DTDPRHAMSYN.PWY treatment lyPMA 12.111295 0.0033428
RUMP.PWY treatment lyPMA 14.382113 0.0033428
PWY.4361 treatment lyPMA 11.474191 0.0033463
PWY0.845 treatment lyPMA 16.116011 0.0033463
PWY.5723 treatment lyPMA 13.326350 0.0034085
PWY.6147 treatment QIAamp 15.228135 0.0034161
GLYCOLYSIS.E.D treatment lyPMA 16.204866 0.0035880
PWY.6588 treatment lyPMA 12.009264 0.0036163
PWY.5367 treatment lyPMA 12.381616 0.0036949
OANTIGEN.PWY treatment lyPMA 15.464195 0.0037187
PYRIDOXSYN.PWY treatment lyPMA 16.119948 0.0038151
PWY66.391 treatment lyPMA 12.656688 0.0039580
PWY.6969 treatment lyPMA 17.425122 0.0041174
PWY.7288 treatment lyPMA 12.822882 0.0042189
PWY0.781 treatment lyPMA 15.608321 0.0042231
PWY.5667 treatment lyPMA 13.144588 0.0044231
PWY0.1319 treatment lyPMA 13.143321 0.0044419
PWY.5973 treatment lyPMA 16.844201 0.0045946
GLYOXYLATE.BYPASS treatment lyPMA 16.748078 0.0046480
P4.PWY treatment lyPMA 15.351166 0.0046480
GLYCOLYSIS treatment lyPMA 14.304330 0.0046790
ARG.POLYAMINE.SYN treatment lyPMA 12.773547 0.0047002
PWY0.162 treatment lyPMA 13.058269 0.0047976
LACTOSECAT.PWY treatment lyPMA 15.650513 0.0050178
PWY3O.355 treatment lyPMA 13.007577 0.0052165
POLYAMSYN.PWY treatment lyPMA 12.714902 0.0054444
URSIN.PWY treatment lyPMA 12.896544 0.0058773
PYRIDNUCSAL.PWY treatment lyPMA 11.887690 0.0062175
ANAGLYCOLYSIS.PWY treatment lyPMA 12.627492 0.0062815
PWY.1861 treatment lyPMA 15.272698 0.0063969
PWY66.400 treatment lyPMA 13.005820 0.0064033
TEICHOICACID.PWY treatment lyPMA 15.432104 0.0064033
PWY.5005 treatment lyPMA 12.014463 0.0065655
PYRIDNUCSYN.PWY treatment lyPMA 15.031384 0.0066700
THISYN.PWY treatment lyPMA 12.063966 0.0071132
P461.PWY treatment lyPMA 12.966218 0.0071716
PWYG.321 treatment lyPMA 15.974095 0.0071819
PWY.6895 treatment lyPMA 11.934830 0.0071903
PWY.6859 treatment lyPMA 13.636234 0.0074640
PWY.7279 treatment Host zero 14.583841 0.0074694
PWY.5138 treatment lyPMA 12.756972 0.0076010
PWY0.1261 treatment lyPMA 15.345137 0.0077425
PWY.6608 treatment lyPMA 17.870291 0.0077493
ARGININE.SYN4.PWY treatment lyPMA 14.469480 0.0078212
PWY.4702 treatment lyPMA 11.932389 0.0080317
PHOSLIPSYN.PWY treatment lyPMA 11.974673 0.0080452
PWY.4984 treatment lyPMA 16.543725 0.0080696
PWY.7254 treatment lyPMA 16.469770 0.0082304
PWY.821 treatment lyPMA 12.926731 0.0084266
P221.PWY treatment lyPMA 14.943366 0.0084597
GLYCOGENSYNTH.PWY treatment lyPMA 12.710306 0.0085771
TRPSYN.PWY treatment lyPMA 12.996795 0.0088696
PWY.6353 treatment lyPMA 14.095459 0.0090426
PWY.7323 treatment lyPMA 18.015799 0.0091049
PWY.7388 treatment lyPMA 15.073360 0.0094306
PWY0.1061 treatment lyPMA 12.552240 0.0101117
PWY.6892 treatment lyPMA 12.195731 0.0101317
PWY.5747 treatment lyPMA 16.562231 0.0102062
PWY.6797 treatment lyPMA 12.373885 0.0102583
PWY66.422 treatment lyPMA 14.548810 0.0104973
PWY.6703 treatment lyPMA 17.222837 0.0107580
PWY.6891 treatment lyPMA 12.081943 0.0108591
PWY.6470 treatment lyPMA 13.728480 0.0109622
ARGORNPROST.PWY treatment lyPMA 15.459228 0.0109991
CITRULBIO.PWY treatment lyPMA 15.523020 0.0110532
PWY.7200 treatment lyPMA 15.657602 0.0112937
COLANSYN.PWY treatment lyPMA 16.702069 0.0117643
PWY.2723 treatment lyPMA 12.482434 0.0117817
P562.PWY treatment lyPMA 13.603124 0.0120290
PWY.5306 treatment lyPMA 12.589679 0.0121561
P125.PWY treatment lyPMA 14.801576 0.0125561
GLUCARDEG.PWY treatment lyPMA 12.960754 0.0129790
VALSYN.PWY treatment Host zero 13.069205 0.0130309
PWY.5083 treatment lyPMA 16.449227 0.0130437
GLYCOCAT.PWY treatment lyPMA 12.149412 0.0138727
PWY.7245 treatment lyPMA 13.365305 0.0140118
PWY.6317 treatment lyPMA 12.617950 0.0141544
KETOGLUCONMET.PWY treatment lyPMA 12.555228 0.0146718
COMPLETE.ARO.PWY treatment lyPMA 12.092341 0.0160954
ARO.PWY treatment lyPMA 12.068662 0.0163385
PWY.3001 treatment lyPMA 12.884738 0.0163844
PWY.6519 treatment lyPMA 15.691302 0.0165234
PWY.5692 treatment lyPMA 13.078809 0.0180252
URDEGR.PWY treatment lyPMA 13.078809 0.0180252
PWY.6318 treatment lyPMA 16.523215 0.0181690
PWY66.367 treatment lyPMA 13.089242 0.0186774
VALDEG.PWY treatment lyPMA 12.825092 0.0192811
FERMENTATION.PWY treatment lyPMA 15.708787 0.0193996
BIOTIN.BIOSYNTHESIS.PWY treatment lyPMA 13.895550 0.0200685
PWY.2941 treatment lyPMA 15.067424 0.0204743
GLUCUROCAT.PWY treatment lyPMA 12.591440 0.0218580
TCA treatment lyPMA 14.044790 0.0220336
ARGDEG.PWY treatment lyPMA 13.151693 0.0225968
ORNARGDEG.PWY treatment lyPMA 13.151693 0.0225968
GALACT.GLUCUROCAT.PWY treatment lyPMA 12.875511 0.0228788
GALACTARDEG.PWY treatment lyPMA 12.334075 0.0234447
GLUCARGALACTSUPER.PWY treatment lyPMA 12.334075 0.0234447
PWY.5918 treatment lyPMA 13.226297 0.0239728
COA.PWY.1 treatment QIAamp 11.665149 0.0244971
SALVADEHYPOX.PWY treatment lyPMA 13.776763 0.0252679
PWY.6113 treatment lyPMA 11.517436 0.0275131
POLYAMINSYN3.PWY treatment lyPMA 12.030321 0.0276100
FUC.RHAMCAT.PWY treatment lyPMA 12.152742 0.0276614
THISYNARA.PWY treatment lyPMA 13.676297 0.0301478
PWY.6527 treatment lyPMA 12.408333 0.0306687
PWY66.398 treatment lyPMA 12.940431 0.0307942
CALVIN.PWY treatment lyPMA 14.308542 0.0310434
PWY.6992 treatment lyPMA 11.226064 0.0312757
NONOXIPENT.PWY treatment QIAamp 11.941759 0.0318398
PWY.5464 treatment lyPMA 12.355635 0.0327029
SO4ASSIM.PWY treatment lyPMA 14.174681 0.0331532
PWY.5514 treatment lyPMA 10.173603 0.0335190
PWY.241 treatment lyPMA 15.649721 0.0338503
PWY.5384 treatment lyPMA 12.893289 0.0338503
PWY.6609 treatment Benzonase 12.401066 0.0347366
PWY.7268 treatment lyPMA 12.322190 0.0349421
PWY.7626 treatment lyPMA 10.602569 0.0350276
PWY.6285 treatment lyPMA 12.924402 0.0350335
PWY66.201 treatment lyPMA 10.670948 0.0368275
PWY.5129 treatment lyPMA 11.954992 0.0370265
PWY.7283 treatment lyPMA 11.940412 0.0374430
PWY.7527 treatment lyPMA 12.063949 0.0375495
PWY.7528 treatment lyPMA 10.434383 0.0377512
PWY.7328 treatment lyPMA 12.208034 0.0381543
PWY.5154 treatment lyPMA 16.618450 0.0394516
PWY.6352 treatment lyPMA 11.875163 0.0394516
PWY.7039 treatment lyPMA 11.597086 0.0395028
PWY.6309 treatment lyPMA 11.722748 0.0397332
PWY.7592 treatment lyPMA 11.935561 0.0402630
MANNOSYL.CHITO.DOLICHOL.BIOSYNTHESIS treatment lyPMA 11.651141 0.0404156
TRIGLSYN.PWY treatment lyPMA 11.438796 0.0405052
PWY.7036 treatment lyPMA 11.892469 0.0409554
PWY.7053 treatment lyPMA 11.738858 0.0411152
SPHINGOLIPID.SYN.PWY treatment lyPMA 11.743618 0.0413561
PWY.5081 treatment lyPMA 11.637937 0.0423602
ORNDEG.PWY treatment lyPMA 12.714203 0.0424394
PWY.5022 treatment lyPMA 16.766734 0.0425764
PWY.7546 treatment lyPMA 11.605601 0.0427501
PWY.5304 treatment lyPMA 11.802428 0.0436190
PWY.7606 treatment lyPMA 11.662213 0.0436820
PWY.6829 treatment lyPMA 12.643024 0.0440801
PWY.7539 treatment lyPMA 11.744263 0.0441427
PWY.7208 treatment QIAamp 12.356065 0.0462135
PWY.7007 treatment lyPMA 11.651153 0.0464872
PWY.5838 treatment lyPMA 11.831542 0.0464896
PWY.5067 treatment lyPMA 11.094221 0.0470595
PWY.6125 treatment QIAamp 12.023619 0.0470595
PWY.5861 treatment lyPMA 11.810720 0.0485953
PWY.7196 treatment lyPMA 13.299493 0.0494065
PWY.5381 treatment lyPMA 11.095322 0.0501402
PWY.6531 treatment lyPMA 12.370429 0.0506428
PWY.7337 treatment lyPMA 11.405467 0.0506428
PWY.7338 treatment lyPMA 11.405467 0.0506428
GLUCOSE1PMETAB.PWY treatment lyPMA 11.479911 0.0511422
PWY.5079 treatment lyPMA 10.751863 0.0521130
PWY.4041 treatment lyPMA 11.269646 0.0539760
PWY.7197 treatment QIAamp 12.059866 0.0567052
PWY.7228 treatment QIAamp 11.868023 0.0577075
PWY66.409 treatment lyPMA 10.571484 0.0578939
PWY.3502 treatment lyPMA 11.178828 0.0583357
COBALSYN.PWY treatment lyPMA 10.565551 0.0591426
PWY.6351 treatment lyPMA 11.115878 0.0620525
PWY.7282 treatment lyPMA 10.783924 0.0631644
DENITRIFICATION.PWY treatment lyPMA 13.775001 0.0635578
PWY.5989 treatment lyPMA 12.539258 0.0636017
PWY.7118 treatment lyPMA 11.449722 0.0643642
PWY.6981 treatment lyPMA 10.887056 0.0646027
PWY0.42 treatment lyPMA 12.134184 0.0650029
PWY66.388 treatment lyPMA 11.926335 0.0654707
PWY.5656 treatment lyPMA 11.262756 0.0656139
AST.PWY treatment lyPMA 12.925791 0.0658535
PWY.6263 treatment lyPMA 12.288427 0.0668910
PWY.7411 treatment lyPMA 11.079530 0.0674354
PWY.2201 treatment lyPMA 12.908367 0.0732521
PWY.5347 treatment lyPMA 12.379242 0.0734364
PWY.5103 treatment lyPMA 12.557227 0.0740257
PWY.621 treatment lyPMA 9.678576 0.0754965
PWY.7409 treatment lyPMA 10.501277 0.0762445
BRANCHED.CHAIN.AA.SYN.PWY treatment lyPMA 12.398479 0.0762918
PWY.5177 treatment lyPMA 13.511797 0.0771217
UBISYN.PWY treatment lyPMA 12.011745 0.0835220
P164.PWY treatment lyPMA 12.212886 0.0848672
ILEUSYN.PWY treatment lyPMA 11.886259 0.0850172
PWY.5044 treatment lyPMA 11.048660 0.0866958
PWY.6596 treatment lyPMA 11.052322 0.0866958
PWY.5871 treatment lyPMA 11.062386 0.0878288
PWY.5873 treatment lyPMA 11.062386 0.0878288
CODH.PWY treatment lyPMA 11.487059 0.0888305
PWY.5870 treatment lyPMA 10.936284 0.0894628
PWY.5872 treatment lyPMA 11.176384 0.0894628
PWY.3801 treatment lyPMA 11.671608 0.0895293
PWY3O.19 treatment lyPMA 10.957194 0.0895293
PWY.7345 treatment lyPMA 11.694075 0.0896610
PWY.7286 treatment lyPMA 11.545038 0.0909807
PWY.7117 treatment lyPMA 13.200467 0.0914084
PWY.5651 treatment lyPMA 10.848148 0.0934163
NADSYN.PWY treatment lyPMA 10.777628 0.0938815
LIPASYN.PWY treatment lyPMA 11.291335 0.0940850
PWY.7204 treatment lyPMA 12.252985 0.0940850
PWY.5941 treatment lyPMA 10.557637 0.0977698

Final results summary

Sequencing results

# Summary table of the main issue observed for each host-depletion treatment
# (columns) in each sample type (rows), rendered as an HTML table.
# NOTE(review): NA presumably means "no issue noted" for that
# treatment / sample-type pair -- confirm.
# The table is built directly with data.frame(); check.names = FALSE keeps the
# space in the "Host zero" column name.
data.frame(
        lyPMA = c("No increase in final reads",
                  "No increase in final reads",
                  "No increase in final reads"),
        Benzonase = c("No decrease in host %",
                      "No decrease in host %",
                      "No decrease in host %"),
        `Host zero` = c(NA, NA, NA),
        Molysis = c("No decrease in host %",
                    "High chance of failure in library prep",
                    NA),
        QIAamp = c("No decrease in host %",
                   NA,
                   "No decrease in host %"),
        row.names = c("BAL", "Nasal", "Sputum"),
        check.names = FALSE
) %>%
        kbl(format = "html", caption = "Table of issues of each treatment method") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Table of issues of each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL No increase in final reads No decrease in host % NA No decrease in host % No decrease in host %
Nasal No increase in final reads No decrease in host % NA High chance of failure in library prep NA
Sputum No increase in final reads No decrease in host % NA NA No decrease in host %

Diversity changes (taxa)

# Summary table of taxonomic diversity changes for each host-depletion
# treatment (columns) in each sample type (rows), rendered as an HTML table.
# NOTE(review): NA presumably means "no change detected" -- confirm.
# The "Beta changed" label is spelled identically in every cell so the cells
# can be compared or filtered as plain strings.
data.frame(
        lyPMA = c(NA, "Beta changed", "Shannon +"),
        Benzonase = c(NA, NA, "Richness + InvSimp +"),
        `Host zero` = c(NA, "Richness + InvSimp +", NA),
        Molysis = c(NA, "Richness + InvSimp +", "Beta changed"),
        QIAamp = c("Beta changed", NA, "Beta changed"),
        row.names = c("BAL", "Nasal", "Sputum"),
        check.names = FALSE
) %>%
        kbl(format = "html", caption = "Table of community changes induced by each treatment method") %>%
        kable_styling(full_width = FALSE, html_font = "serif")
Table of community changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA Beta changed
Nasal Beta changed NA Richness + InvSimp + Richness + InvSimp + NA
Sputum Shannon + Richness + InvSimp + NA Beta changed Beta changed

Diversity changes (function)

# Summary table of functional diversity changes: one column per treatment,
# one row per sample type (BAL, Nasal, Sputum), rendered as an HTML table.
# check.names = FALSE keeps the space in the "Host zero" column name.
data.frame(
        lyPMA = c(NA, NA, "Shannon +"),
        Benzonase = c(NA, NA, "Shannon +"),
        `Host zero` = c(NA, "Richness +", "Shannon +"),
        Molysis = c(NA, "Richness + InvSimp + BPI +", "Shannon +"),
        QIAamp = c(NA, "Richness + Shannon +", "Shannon +"),
        row.names = c("BAL", "Nasal", "Sputum"),
        check.names = FALSE
) %>%
        kbl(format = "html", caption = "Table of functional diversity changes induced by each treatment method") %>%
        kable_styling(full_width = 0, html_font = "serif")
Table of functional diversity changes induced by each treatment method
lyPMA Benzonase Host zero Molysis QIAamp
BAL NA NA NA NA NA
Nasal NA NA Richness + Richness + InvSimp + BPI + Richness + Shannon +
Sputum Shannon + Shannon + Shannon + Shannon + Shannon +

Potential contaminants

# Summary table of potential contaminant taxa for each host-depletion
# treatment (columns) in each sample type (rows), rendered as an HTML table.
# column_spec(2:6, ...) italicises the five treatment columns; column 1 holds
# the row names.
data.frame(
        lyPMA = c("Listeria",
                  "Listeria",
                  "Listeria, Candida, Corynebacterium"),
        Benzonase = c("Listeria",
                      "Listeria",
                      "Listeria, Candida, Corynebacterium"),
        `Host zero` = c("Listeria",
                        "Listeria",
                        "Listeria, Candida, Corynebacterium"),
        Molysis = c("Streptococcaceae, Listeria",
                    "Streptococcaceae, Listeria",
                    "Streptococcaceae, Listeria, Candida, Corynebacterium"),
        QIAamp = c("Listeria",
                   "Listeria",
                   "Listeria, Candida, Corynebacterium"),
        row.names = c("BAL", "Nasal", "Sputum"),
        check.names = FALSE
) %>%
        kbl(format = "html", caption = "Table of potential contaminants identified by decontam and DA analysis") %>%
        kable_styling(full_width = FALSE, html_font = "serif") %>%
        column_spec(2:6, italic = TRUE) #%>%
Table of potential contaminants identified by decontam and DA analysis
lyPMA Benzonase Host zero Molysis QIAamp
BAL Listeria Listeria Listeria Streptococcaceae, Listeria Listeria
Nasal Listeria Listeria Listeria Streptococcaceae, Listeria Listeria
Sputum Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium Streptococcaceae, Listeria, Candida, Corynebacterium Listeria, Candida, Corynebacterium
  #row_spec(2:3, bold = T)

Conclusion

1. The effect of treatment was sample-type specific.

2. Some methods (lyPMA) caused samples to fail in library prep.

3. One BAL sample failed in sequencing, but most of the treatments enabled its sequencing.

4. Alpha diversity and beta diversity were changed by some treatments, and these changes were specific to certain sample types.

5. DA analysis and decontam showed that there were some potential contaminants.

QIAamp for nasal samples, and Host zero for BAL and sputum samples, successfully 1) increased final reads, 2) lowered host %, and 3) did not change diversity metrics.

Molysis was effective in increasing the sequencing efficiency of sputum samples; however, diversity metrics were significantly changed.

As our study contains potential contaminants, further analysis is required after adding data from the controls.

Done.

Bibliography

#===============================================================================
#BTC.LineZero.Footer.1.1.0
#===============================================================================
#R markdown citation generator.
#===============================================================================
#RLB.Dependencies:
#   magrittr, pacman, stringr
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#BTC.Dependencies:
#   LineZero.Header
#===============================================================================
#Generates citations for each explicitly loaded library.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Print one plain-text citation per explicitly loaded library.
# "r" is prepended to cite base R; unique() drops package names that were
# listed more than once so each package is cited a single time.
str_libraries <- unique(c("r", str_libraries))
# A separate loop variable is used: iterating with `str_libraries` itself
# would leave the vector overwritten by its last element after the loop.
for (str_library in str_libraries) {
    str_library |>
        pacman::p_citation() |>
        print(bibtex = FALSE) |>
        capture.output() %>%
        # Drop the first three captured lines, then any blank lines.
        .[-1:-3] %>% .[. != ""] |>
        stringr::str_squish() |>
        # Join the wrapped lines first so an emphasis pair that was split
        # across two lines is still matched as one pair.
        paste(collapse = " ") |>
        # Strip `_emphasis_` markers (both underscores). Underscores that sit
        # inside a word, such as in a URL, are left alone.
        stringr::str_replace_all(
            "(?<![A-Za-z0-9_])_([^_]+)_(?![A-Za-z0-9_])", "\\1"
        ) |>
        cat()
    cat("\n")
}
## R Core Team (2022). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Wickham H, Bryan J (2023). readxl: Read Excel Files_. R package version 1.4.2, <https://CRAN.R-project.org/package=readxl>.
## phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R, Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## Rinker, T. W. & Kurkiewicz, D. (2017). pacman: Package Management for R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## Garbett SP, Stephens J, Simonov K, Xie Y, Dong Z, Wickham H, Horner J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
## Oksanen J, Simpson G, Blanchet F, Kindt R, Legendre P, Minchin P, O'Hara R, Solymos P, Stevens M, Szoecs E, Wagner H, Barbour M, Bedward M, Bolker B, Borcard D, Carvalho G, Chirico M, De Caceres M, Durand S, Evangelista H, FitzJohn R, Friendly M, Furneaux B, Hannigan G, Hill M, Lahti L, McGlinn D, Ouellette M, Ribeiro Cunha E, Smith T, Stier A, Ter Braak C, Weedon J (2022). vegan: Community Ecology Package. R package version 2.6-4, <https://CRAN.R-project.org/package=vegan>.
## Leo Lahti et al. microbiome R package. URL: http://microbiome.github.io
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.6.2.
## Davis NM, Proctor D, Holmes SP, Relman DA, Callahan BJ (2017). "Simple statistical identification and removal of contaminant sequences in marker-gene and metagenomics data." bioRxiv_, 221499. doi:10.1101/221499 <https://doi.org/10.1101/221499>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Kassambara A (2023). ggpubr: 'ggplot2' Based Publication Ready Plots. R package version 0.6.0, <https://CRAN.R-project.org/package=ggpubr>.
## Douglas Bates, Martin Maechler, Ben Bolker, Steve Walker (2015). Fitting Linear Mixed-Effects Models Using lme4. Journal of Statistical Software, 67(1), 1-48. doi:10.18637/jss.v067.i01.
## Kuznetsova A, Brockhoff PB, Christensen RHB (2017). "lmerTest Package: Tests in Linear Mixed Effects Models." Journal of Statistical Software, *82*(13), 1-26. doi:10.18637/jss.v082.i13 <https://doi.org/10.18637/jss.v082.i13>.
## Ooms J (2023). writexl: Export Data Frames to Excel 'xlsx' Format_. R package version 1.4.2, <https://CRAN.R-project.org/package=writexl>.
## Gonçalves da Silva A (2017). harrietr: Wrangle Phylogenetic Distance Matrices and Other Utilities. R package version 0.2.3, <https://CRAN.R-project.org/package=harrietr>.
## Mallick H et al. (2020). Multivariable Association in Population-scale Meta-omics Studies, http://huttenhower.sph.harvard.edu/maaslin2. To cite the MaAsLin 2 software, please use: Mallick H, Rahnavard A, McIver LJ (2020). MaAsLin 2: Multivariable Association in Population-scale Meta-omics Studies. R/Bioconductor package, http://huttenhower.sph.harvard.edu/maaslin2.
## Wilke C, Wiernik B (2022). ggtext: Improved Text Rendering Support for 'ggplot2'. R package version 0.1.2, <https://CRAN.R-project.org/package=ggtext>.
## Aphalo P (2022). ggpmisc: Miscellaneous Extensions to 'ggplot2'_. R package version 0.5.2, <https://CRAN.R-project.org/package=ggpmisc>.
## Auguie B (2017). gridExtra: Miscellaneous Functions for "Grid" Graphics. R package version 2.3, <https://CRAN.R-project.org/package=gridExtra>.
## Wood S, Scheipl F (2020). gamm4: Generalized Additive Mixed Models using 'mgcv' and 'lme4'. R package version 0.2-6, <https://CRAN.R-project.org/package=gamm4>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Hadley Wickham (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21(12), 1-20. URL http://www.jstatsoft.org/v21/i12/.
## Zhu H (2021). kableExtra: Construct Complex Table with 'kable' and Pipe Syntax. R package version 1.3.4, <https://CRAN.R-project.org/package=kableExtra>.
## Yihui Xie (2023). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.42. Yihui Xie (2015) Dynamic Documents with R and knitr. 2nd edition. Chapman and Hall/CRC. ISBN 978-1498716963 Yihui Xie (2014) knitr: A Comprehensive Tool for Reproducible Research in R. In Victoria Stodden, Friedrich Leisch and Roger D. Peng, editors, Implementing Reproducible Computational Research. Chapman and Hall/CRC. ISBN 978-1466561595
## Guangchuang Yu. (2022). Data Integration, Manipulation and Visualization of Phylogenetic Trees (1st edition). Chapman and Hall/CRC. Shuangbin Xu, Lin Li, Xiao Luo, Meijun Chen, Wenli Tang, Li Zhan, Zehan Dai, Tommy T. Lam, Yi Guan, Guangchuang Yu. Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data. iMeta 2022, 4(1):e56. doi:10.1002/imt2.56 Guangchuang Yu. Using ggtree to visualize data on tree-like structures. Current Protocols in Bioinformatics, 2020, 69:e96. doi: 10.1002/cpbi.96 Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(2):3041-3043. doi: 10.1093/molbev/msy194 Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## John Fox and Sanford Weisberg (2019). An {R} Companion to Applied Regression, Third Edition. Thousand Oaks CA: Sage. URL: https://socialsciences.mcmaster.ca/jfox/Books/Companion/
#===============================================================================